github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/cluster_run.go (about) 1 // Copyright 2018 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tester 16 17 import ( 18 "fmt" 19 "os" 20 "testing" 21 "time" 22 23 "github.com/lfch/etcd-io/client/pkg/v3/fileutil" 24 "github.com/lfch/etcd-io/tests/v3/functional/rpcpb" 25 26 "go.uber.org/zap" 27 ) 28 29 // compactQPS is rough number of compact requests per second. 30 // Previous tests showed etcd can compact about 60,000 entries per second. 31 const compactQPS = 50000 32 33 // Run starts tester. 34 func (clus *Cluster) Run(t *testing.T) { 35 defer printReport() 36 37 // updateCases must be executed after etcd is started, because the FAILPOINTS case 38 // needs to obtain all the failpoints from the etcd member. 39 clus.updateCases() 40 41 if err := fileutil.TouchDirAll(clus.lg, clus.Tester.DataDir); err != nil { 42 clus.lg.Panic( 43 "failed to create test data directory", 44 zap.String("dir", clus.Tester.DataDir), 45 zap.Error(err), 46 ) 47 } 48 49 var preModifiedKey int64 50 for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ { 51 t.Run(fmt.Sprintf("round:%v", round), func(t *testing.T) { 52 preModifiedKey = clus.doRoundAndCompact(t, round, preModifiedKey) 53 }) 54 55 if round > 0 && round%500 == 0 { // every 500 rounds 56 t.Logf("Defragmenting in round: %v", round) 57 if err := clus.defrag(); err != nil { 58 clus.failed(err) 59 return 60 } 61 } 62 } 63 64 clus.lg.Info( 65 "functional-tester PASS", 66 zap.Int("round", clus.rd), 67 zap.Int("case", clus.cs), 68 zap.Int("case-total", len(clus.cases)), 69 ) 70 } 71 72 func (clus *Cluster) doRoundAndCompact(t *testing.T, round int, preModifiedKey int64) (postModifiedKey int64) { 73 roundTotalCounter.Inc() 74 clus.rd = round 75 76 if err := clus.doRound(t); err != nil { 77 clus.lg.Error( 78 "round FAIL", 79 zap.Int("round", clus.rd), 80 zap.Int("case", clus.cs), 81 zap.Int("case-total", len(clus.cases)), 82 zap.Error(err), 83 ) 84 if clus.cleanup(err) != nil { 85 return 86 } 87 // reset preModifiedKey after clean up 88 postModifiedKey = 0 89 return 90 } 91 92 // -1 so that logPrefix doesn't print out 'case' 93 clus.cs = -1 94 95 revToCompact := max(0, clus.currentRevision-10000) 96 currentModifiedKey := clus.stresser.ModifiedKeys() 97 modifiedKey := currentModifiedKey - preModifiedKey 98 timeout := 10 * time.Second 99 timeout += time.Duration(modifiedKey/compactQPS) * time.Second 100 clus.lg.Info( 101 "compact START", 102 zap.Int("round", clus.rd), 103 zap.Int("case", clus.cs), 104 zap.Int("case-total", len(clus.cases)), 105 zap.Duration("timeout", timeout), 106 ) 107 if err := clus.compact(revToCompact, timeout); err != nil { 108 clus.lg.Warn( 109 "compact FAIL", 110 zap.Int("round", clus.rd), 111 zap.Int("case", clus.cs), 112 zap.Int("case-total", len(clus.cases)), 113 zap.Error(err), 114 ) 115 if err = clus.cleanup(err); err != nil { 116 clus.lg.Warn( 117 "cleanup FAIL", 118 zap.Int("round", clus.rd), 119 zap.Int("case", clus.cs), 120 zap.Int("case-total", len(clus.cases)), 121 zap.Error(err), 122 ) 123 return 124 } 125 // reset preModifiedKey after clean up 126 return 0 127 } 128 return currentModifiedKey 129 } 130 131 func (clus *Cluster) doRound(t *testing.T) error { 132 if clus.Tester.CaseShuffle { 133 clus.shuffleCases() 134 } 135 136 roundNow := time.Now() 137 clus.lg.Info( 138 "round START", 139 zap.Int("round", clus.rd), 140 zap.Int("case-total", len(clus.cases)), 141 zap.Strings("cases", clus.listCases()), 142 ) 143 for i, fa := range clus.cases { 144 clus.cs = i 145 t.Run(fmt.Sprintf("%v_%s", i, fa.TestCase()), 146 func(t *testing.T) { 147 clus.doTestCase(t, fa) 148 }) 149 } 150 151 clus.lg.Info( 152 "round ALL PASS", 153 zap.Int("round", clus.rd), 154 zap.Strings("cases", clus.listCases()), 155 zap.Int("case-total", len(clus.cases)), 156 zap.Duration("took", time.Since(roundNow)), 157 ) 158 return nil 159 } 160 161 func (clus *Cluster) doTestCase(t *testing.T, fa Case) { 162 caseTotal[fa.Desc()]++ 163 caseTotalCounter.WithLabelValues(fa.Desc()).Inc() 164 165 caseNow := time.Now() 166 clus.lg.Info( 167 "case START", 168 zap.Int("round", clus.rd), 169 zap.Int("case", clus.cs), 170 zap.Int("case-total", len(clus.cases)), 171 zap.String("desc", fa.Desc()), 172 ) 173 174 clus.lg.Info("wait health before injecting failures") 175 if err := clus.WaitHealth(); err != nil { 176 t.Fatalf("wait full health error: %v", err) 177 } 178 179 stressStarted := false 180 fcase := fa.TestCase() 181 if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS { 182 clus.lg.Info( 183 "stress START", 184 zap.Int("round", clus.rd), 185 zap.Int("case", clus.cs), 186 zap.Int("case-total", len(clus.cases)), 187 zap.String("desc", fa.Desc()), 188 ) 189 if err := clus.stresser.Stress(); err != nil { 190 t.Fatalf("start stresser error: %v", err) 191 } 192 stressStarted = true 193 } 194 195 clus.lg.Info( 196 "inject START", 197 zap.Int("round", clus.rd), 198 zap.Int("case", clus.cs), 199 zap.Int("case-total", len(clus.cases)), 200 zap.String("desc", fa.Desc()), 201 ) 202 if err := fa.Inject(clus); err != nil { 203 t.Fatalf("injection error: %v", err) 204 } 205 206 // if run local, recovering server may conflict 207 // with stressing client ports 208 // TODO: use unix for local tests 209 clus.lg.Info( 210 "recover START", 211 zap.Int("round", clus.rd), 212 zap.Int("case", clus.cs), 213 zap.Int("case-total", len(clus.cases)), 214 zap.String("desc", fa.Desc()), 215 ) 216 if err := fa.Recover(clus); err != nil { 217 t.Fatalf("recovery error: %v", err) 218 } 219 220 if stressStarted { 221 clus.lg.Info( 222 "stress PAUSE", 223 zap.Int("round", clus.rd), 224 zap.Int("case", clus.cs), 225 zap.Int("case-total", len(clus.cases)), 226 zap.String("desc", fa.Desc()), 227 ) 228 ems := clus.stresser.Pause() 229 if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 { 230 ess := make([]string, 0, len(ems)) 231 cnt := 0 232 for k, v := range ems { 233 ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v)) 234 cnt += v 235 } 236 clus.lg.Warn( 237 "expected no errors", 238 zap.String("desc", fa.Desc()), 239 zap.Strings("errors", ess), 240 ) 241 242 // with network delay, some ongoing requests may fail 243 // only return error, if more than 30% of QPS requests fail 244 if cnt > int(float64(clus.Tester.StressQPS)*0.3) { 245 t.Fatalf("expected no error in %q, got %q", fcase.String(), ess) 246 } 247 } 248 } 249 250 clus.lg.Info( 251 "health check START", 252 zap.Int("round", clus.rd), 253 zap.Int("case", clus.cs), 254 zap.Int("case-total", len(clus.cases)), 255 zap.String("desc", fa.Desc()), 256 ) 257 if err := clus.WaitHealth(); err != nil { 258 t.Fatalf("wait full health error: %v", err) 259 } 260 261 var checkerFailExceptions []rpcpb.Checker 262 switch fcase { 263 case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH: 264 // TODO: restore from snapshot 265 checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE) 266 } 267 268 clus.lg.Info( 269 "consistency check START", 270 zap.Int("round", clus.rd), 271 zap.Int("case", clus.cs), 272 zap.Int("case-total", len(clus.cases)), 273 zap.String("desc", fa.Desc()), 274 ) 275 if err := clus.runCheckers(checkerFailExceptions...); err != nil { 276 t.Fatalf("consistency check error (%v)", err) 277 } 278 clus.lg.Info( 279 "consistency check PASS", 280 zap.Int("round", clus.rd), 281 zap.Int("case", clus.cs), 282 zap.Int("case-total", len(clus.cases)), 283 zap.String("desc", fa.Desc()), 284 zap.Duration("took", time.Since(caseNow)), 285 ) 286 } 287 288 func (clus *Cluster) updateRevision() error { 289 revs, _, err := clus.getRevisionHash() 290 for _, rev := range revs { 291 clus.currentRevision = rev 292 break // just need get one of the current revisions 293 } 294 295 clus.lg.Info( 296 "updated current revision", 297 zap.Int64("current-revision", clus.currentRevision), 298 ) 299 return err 300 } 301 302 func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) { 303 if err = clus.compactKV(rev, timeout); err != nil { 304 clus.lg.Warn( 305 "compact FAIL", 306 zap.Int64("current-revision", clus.currentRevision), 307 zap.Int64("compact-revision", rev), 308 zap.Error(err), 309 ) 310 return err 311 } 312 clus.lg.Info( 313 "compact DONE", 314 zap.Int64("current-revision", clus.currentRevision), 315 zap.Int64("compact-revision", rev), 316 ) 317 318 if err = clus.checkCompact(rev); err != nil { 319 clus.lg.Warn( 320 "check compact FAIL", 321 zap.Int64("current-revision", clus.currentRevision), 322 zap.Int64("compact-revision", rev), 323 zap.Error(err), 324 ) 325 return err 326 } 327 clus.lg.Info( 328 "check compact DONE", 329 zap.Int64("current-revision", clus.currentRevision), 330 zap.Int64("compact-revision", rev), 331 ) 332 333 return nil 334 } 335 336 func (clus *Cluster) failed(err error) { 337 clus.lg.Error( 338 "functional-tester FAIL", 339 zap.Int("round", clus.rd), 340 zap.Int("case", clus.cs), 341 zap.Int("case-total", len(clus.cases)), 342 zap.Error(err), 343 ) 344 clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() 345 346 os.Exit(2) 347 } 348 349 func (clus *Cluster) cleanup(err error) error { 350 if clus.Tester.ExitOnCaseFail { 351 defer clus.failed(err) 352 } 353 354 roundFailedTotalCounter.Inc() 355 desc := "compact/defrag" 356 if clus.cs != -1 { 357 desc = clus.cases[clus.cs].Desc() 358 } 359 caseFailedTotalCounter.WithLabelValues(desc).Inc() 360 361 clus.lg.Info( 362 "closing stressers before archiving failure data", 363 zap.Int("round", clus.rd), 364 zap.Int("case", clus.cs), 365 zap.Int("case-total", len(clus.cases)), 366 ) 367 clus.stresser.Close() 368 369 if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil { 370 clus.lg.Warn( 371 "cleanup FAIL", 372 zap.Int("round", clus.rd), 373 zap.Int("case", clus.cs), 374 zap.Int("case-total", len(clus.cases)), 375 zap.Error(err), 376 ) 377 return err 378 } 379 if err := clus.send_RESTART_ETCD(); err != nil { 380 clus.lg.Warn( 381 "restart FAIL", 382 zap.Int("round", clus.rd), 383 zap.Int("case", clus.cs), 384 zap.Int("case-total", len(clus.cases)), 385 zap.Error(err), 386 ) 387 return err 388 } 389 390 clus.setStresserChecker() 391 return nil 392 }