github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/case.go (about) 1 // Copyright 2018 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tester 16 17 import ( 18 "fmt" 19 "math/rand" 20 "time" 21 22 "github.com/lfch/etcd-io/tests/v3/functional/rpcpb" 23 24 "go.uber.org/zap" 25 ) 26 27 // Case defines failure/test injection interface. 28 // To add a test case: 29 // 1. implement "Case" interface 30 // 2. define fail case name in "rpcpb.Case" 31 type Case interface { 32 // Inject injects the failure into the testing cluster at the given 33 // round. When calling the function, the cluster should be in health. 34 Inject(clus *Cluster) error 35 // Recover recovers the injected failure caused by the injection of the 36 // given round and wait for the recovery of the testing cluster. 37 Recover(clus *Cluster) error 38 // Desc returns a description of the failure 39 Desc() string 40 // TestCase returns "rpcpb.Case" enum type. 41 TestCase() rpcpb.Case 42 } 43 44 type injectMemberFunc func(*Cluster, int) error 45 type recoverMemberFunc func(*Cluster, int) error 46 47 type caseByFunc struct { 48 desc string 49 rpcpbCase rpcpb.Case 50 injectMember injectMemberFunc 51 recoverMember recoverMemberFunc 52 } 53 54 func (c *caseByFunc) Desc() string { 55 if c.desc != "" { 56 return c.desc 57 } 58 return c.rpcpbCase.String() 59 } 60 61 func (c *caseByFunc) TestCase() rpcpb.Case { 62 return c.rpcpbCase 63 } 64 65 type caseFollower struct { 66 caseByFunc 67 last int 68 lead int 69 } 70 71 func (c *caseFollower) updateIndex(clus *Cluster) error { 72 lead, err := clus.GetLeader() 73 if err != nil { 74 return err 75 } 76 c.lead = lead 77 78 n := len(clus.Members) 79 if c.last == -1 { // first run 80 c.last = clus.rd % n 81 if c.last == c.lead { 82 c.last = (c.last + 1) % n 83 } 84 } else { 85 c.last = (c.last + 1) % n 86 if c.last == c.lead { 87 c.last = (c.last + 1) % n 88 } 89 } 90 return nil 91 } 92 93 func (c *caseFollower) Inject(clus *Cluster) error { 94 if err := c.updateIndex(clus); err != nil { 95 return err 96 } 97 return c.injectMember(clus, c.last) 98 } 99 100 func (c *caseFollower) Recover(clus *Cluster) error { 101 return c.recoverMember(clus, c.last) 102 } 103 104 func (c *caseFollower) Desc() string { 105 if c.desc != "" { 106 return c.desc 107 } 108 return c.rpcpbCase.String() 109 } 110 111 func (c *caseFollower) TestCase() rpcpb.Case { 112 return c.rpcpbCase 113 } 114 115 type caseLeader struct { 116 caseByFunc 117 last int 118 lead int 119 } 120 121 func (c *caseLeader) updateIndex(clus *Cluster) error { 122 lead, err := clus.GetLeader() 123 if err != nil { 124 return err 125 } 126 c.lead = lead 127 c.last = lead 128 return nil 129 } 130 131 func (c *caseLeader) Inject(clus *Cluster) error { 132 if err := c.updateIndex(clus); err != nil { 133 return err 134 } 135 return c.injectMember(clus, c.last) 136 } 137 138 func (c *caseLeader) Recover(clus *Cluster) error { 139 return c.recoverMember(clus, c.last) 140 } 141 142 func (c *caseLeader) TestCase() rpcpb.Case { 143 return c.rpcpbCase 144 } 145 146 type caseQuorum struct { 147 caseByFunc 148 injected map[int]struct{} 149 } 150 151 func (c *caseQuorum) Inject(clus *Cluster) error { 152 c.injected = pickQuorum(len(clus.Members)) 153 for idx := range c.injected { 154 if err := c.injectMember(clus, idx); err != nil { 155 return err 156 } 157 } 158 return nil 159 } 160 161 func (c *caseQuorum) Recover(clus *Cluster) error { 162 for idx := range c.injected { 163 if err := c.recoverMember(clus, idx); err != nil { 164 return err 165 } 166 } 167 return nil 168 } 169 170 func (c *caseQuorum) Desc() string { 171 if c.desc != "" { 172 return c.desc 173 } 174 return c.rpcpbCase.String() 175 } 176 177 func (c *caseQuorum) TestCase() rpcpb.Case { 178 return c.rpcpbCase 179 } 180 181 func pickQuorum(size int) (picked map[int]struct{}) { 182 picked = make(map[int]struct{}) 183 r := rand.New(rand.NewSource(time.Now().UnixNano())) 184 quorum := size/2 + 1 185 for len(picked) < quorum { 186 idx := r.Intn(size) 187 picked[idx] = struct{}{} 188 } 189 return picked 190 } 191 192 type caseAll caseByFunc 193 194 func (c *caseAll) Inject(clus *Cluster) error { 195 for i := range clus.Members { 196 if err := c.injectMember(clus, i); err != nil { 197 return err 198 } 199 } 200 return nil 201 } 202 203 func (c *caseAll) Recover(clus *Cluster) error { 204 for i := range clus.Members { 205 if err := c.recoverMember(clus, i); err != nil { 206 return err 207 } 208 } 209 return nil 210 } 211 212 func (c *caseAll) Desc() string { 213 if c.desc != "" { 214 return c.desc 215 } 216 return c.rpcpbCase.String() 217 } 218 219 func (c *caseAll) TestCase() rpcpb.Case { 220 return c.rpcpbCase 221 } 222 223 // caseUntilSnapshot injects a failure/test and waits for a snapshot event 224 type caseUntilSnapshot struct { 225 desc string 226 rpcpbCase rpcpb.Case 227 Case 228 } 229 230 // all delay failure cases except the ones failing with latency 231 // greater than election timeout (trigger leader election and 232 // cluster keeps operating anyways) 233 var slowCases = map[rpcpb.Case]bool{ 234 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER: true, 235 rpcpb.Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true, 236 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true, 237 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER: true, 238 rpcpb.Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: true, 239 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: true, 240 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM: true, 241 rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ALL: true, 242 } 243 244 func (c *caseUntilSnapshot) Inject(clus *Cluster) error { 245 if err := c.Case.Inject(clus); err != nil { 246 return err 247 } 248 249 snapshotCount := clus.Members[0].Etcd.SnapshotCount 250 251 now := time.Now() 252 clus.lg.Info( 253 "trigger snapshot START", 254 zap.String("desc", c.Desc()), 255 zap.Int64("etcd-snapshot-count", snapshotCount), 256 ) 257 258 // maxRev may fail since failure just injected, retry if failed. 259 startRev, err := clus.maxRev() 260 for i := 0; i < 10 && startRev == 0; i++ { 261 startRev, err = clus.maxRev() 262 } 263 if startRev == 0 { 264 return err 265 } 266 lastRev := startRev 267 268 // healthy cluster could accept 1000 req/sec at least. 269 // 3x time to trigger snapshot. 270 retries := int(snapshotCount) / 1000 * 3 271 if v, ok := slowCases[c.TestCase()]; v && ok { 272 // slow network takes more retries 273 retries *= 5 274 } 275 276 for i := 0; i < retries; i++ { 277 lastRev, err = clus.maxRev() 278 if lastRev == 0 { 279 clus.lg.Info( 280 "trigger snapshot RETRY", 281 zap.Int("retries", i), 282 zap.Int64("etcd-snapshot-count", snapshotCount), 283 zap.Int64("start-revision", startRev), 284 zap.Error(err), 285 ) 286 time.Sleep(3 * time.Second) 287 continue 288 } 289 290 // If the number of proposals committed is bigger than snapshot count, 291 // a new snapshot should have been created. 292 diff := lastRev - startRev 293 if diff > snapshotCount { 294 clus.lg.Info( 295 "trigger snapshot PASS", 296 zap.Int("retries", i), 297 zap.String("desc", c.Desc()), 298 zap.Int64("committed-entries", diff), 299 zap.Int64("etcd-snapshot-count", snapshotCount), 300 zap.Int64("start-revision", startRev), 301 zap.Int64("last-revision", lastRev), 302 zap.Duration("took", time.Since(now)), 303 ) 304 return nil 305 } 306 307 clus.lg.Info( 308 "trigger snapshot RETRY", 309 zap.Int("retries", i), 310 zap.Int64("committed-entries", diff), 311 zap.Int64("etcd-snapshot-count", snapshotCount), 312 zap.Int64("start-revision", startRev), 313 zap.Int64("last-revision", lastRev), 314 zap.Duration("took", time.Since(now)), 315 zap.Error(err), 316 ) 317 time.Sleep(time.Second) 318 if err != nil { 319 time.Sleep(2 * time.Second) 320 } 321 } 322 323 return fmt.Errorf("cluster too slow: only %d commits in %d retries", lastRev-startRev, retries) 324 } 325 326 func (c *caseUntilSnapshot) Desc() string { 327 if c.desc != "" { 328 return c.desc 329 } 330 if c.rpcpbCase.String() != "" { 331 return c.rpcpbCase.String() 332 } 333 return c.Case.Desc() 334 } 335 336 func (c *caseUntilSnapshot) TestCase() rpcpb.Case { 337 return c.rpcpbCase 338 }