github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/case_sigquit_remove.go (about) 1 // Copyright 2018 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tester 16 17 import ( 18 "context" 19 "fmt" 20 "sort" 21 "strings" 22 "time" 23 24 clientv3 "github.com/lfch/etcd-io/client/v3" 25 "github.com/lfch/etcd-io/tests/v3/functional/rpcpb" 26 27 "go.uber.org/zap" 28 ) 29 30 func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { 31 cli1, err := clus.Members[idx1].CreateEtcdClient() 32 if err != nil { 33 return err 34 } 35 defer cli1.Close() 36 37 var mresp *clientv3.MemberListResponse 38 mresp, err = cli1.MemberList(context.Background()) 39 var mss []string 40 if err == nil && mresp != nil { 41 mss = describeMembers(mresp) 42 } 43 clus.lg.Info( 44 "member list before disastrous machine failure", 45 zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint), 46 zap.Strings("members", mss), 47 zap.Error(err), 48 ) 49 if err != nil { 50 return err 51 } 52 53 sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint) 54 if serr != nil { 55 return serr 56 } 57 id1 := sresp.Header.MemberId 58 is1 := fmt.Sprintf("%016x", id1) 59 60 clus.lg.Info( 61 "disastrous machine failure START", 62 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 63 zap.String("target-member-id", is1), 64 zap.Error(err), 65 ) 66 err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA) 67 clus.lg.Info( 68 "disastrous machine failure END", 69 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 70 zap.String("target-member-id", is1), 71 zap.Error(err), 72 ) 73 if err != nil { 74 return err 75 } 76 77 time.Sleep(2 * time.Second) 78 79 idx2 := (idx1 + 1) % len(clus.Members) 80 var cli2 *clientv3.Client 81 cli2, err = clus.Members[idx2].CreateEtcdClient() 82 if err != nil { 83 return err 84 } 85 defer cli2.Close() 86 87 // FIXME(bug): this may block forever during 88 // "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT" 89 // is the new leader too busy with snapshotting? 90 // is raft proposal dropped? 91 // enable client keepalive for failover? 92 clus.lg.Info( 93 "member remove after disaster START", 94 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 95 zap.String("target-member-id", is1), 96 zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), 97 ) 98 ctx, cancel := context.WithTimeout(context.Background(), time.Minute) 99 _, err = cli2.MemberRemove(ctx, id1) 100 cancel() 101 clus.lg.Info( 102 "member remove after disaster END", 103 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 104 zap.String("target-member-id", is1), 105 zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), 106 zap.Error(err), 107 ) 108 if err != nil { 109 return err 110 } 111 112 time.Sleep(2 * time.Second) 113 114 mresp, err = cli2.MemberList(context.Background()) 115 mss = []string{} 116 if err == nil && mresp != nil { 117 mss = describeMembers(mresp) 118 } 119 clus.lg.Info( 120 "member list after member remove", 121 zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), 122 zap.Strings("members", mss), 123 zap.Error(err), 124 ) 125 return err 126 } 127 128 func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { 129 idx2 := (idx1 + 1) % len(clus.Members) 130 cli2, err := clus.Members[idx2].CreateEtcdClient() 131 if err != nil { 132 return err 133 } 134 defer cli2.Close() 135 136 _, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs) 137 clus.lg.Info( 138 "member add before fresh restart", 139 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 140 zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), 141 zap.Error(err), 142 ) 143 if err != nil { 144 return err 145 } 146 147 time.Sleep(2 * time.Second) 148 149 clus.Members[idx1].Etcd.InitialClusterState = "existing" 150 err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD) 151 clus.lg.Info( 152 "fresh restart after member add", 153 zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), 154 zap.Error(err), 155 ) 156 if err != nil { 157 return err 158 } 159 160 time.Sleep(2 * time.Second) 161 162 var mresp *clientv3.MemberListResponse 163 mresp, err = cli2.MemberList(context.Background()) 164 var mss []string 165 if err == nil && mresp != nil { 166 mss = describeMembers(mresp) 167 } 168 clus.lg.Info( 169 "member list after member add", 170 zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), 171 zap.Strings("members", mss), 172 zap.Error(err), 173 ) 174 return err 175 } 176 177 func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Case { 178 cc := caseByFunc{ 179 rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER, 180 injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA, 181 recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA, 182 } 183 c := &caseFollower{cc, -1, -1} 184 return &caseDelay{ 185 Case: c, 186 delayDuration: clus.GetCaseDelayDuration(), 187 } 188 } 189 190 func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case { 191 return &caseUntilSnapshot{ 192 rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT, 193 Case: new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus), 194 } 195 } 196 197 func new_Case_SIGQUIT_AND_REMOVE_LEADER(clus *Cluster) Case { 198 cc := caseByFunc{ 199 rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER, 200 injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA, 201 recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA, 202 } 203 c := &caseLeader{cc, -1, -1} 204 return &caseDelay{ 205 Case: c, 206 delayDuration: clus.GetCaseDelayDuration(), 207 } 208 } 209 210 func new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case { 211 return &caseUntilSnapshot{ 212 rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT, 213 Case: new_Case_SIGQUIT_AND_REMOVE_LEADER(clus), 214 } 215 } 216 217 func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) { 218 ss = make([]string, len(mresp.Members)) 219 for i, m := range mresp.Members { 220 ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s", 221 m.Name, 222 m.ID, 223 strings.Join(m.ClientURLs, ","), 224 strings.Join(m.PeerURLs, ","), 225 ) 226 } 227 sort.Strings(ss) 228 return ss 229 }