go.etcd.io/etcd@v3.3.27+incompatible/functional/tester/cluster_run.go (about) 1 // Copyright 2018 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tester 16 17 import ( 18 "fmt" 19 "os" 20 "time" 21 22 "github.com/coreos/etcd/functional/rpcpb" 23 "github.com/coreos/etcd/pkg/fileutil" 24 25 "go.uber.org/zap" 26 ) 27 28 // compactQPS is rough number of compact requests per second. 29 // Previous tests showed etcd can compact about 60,000 entries per second. 30 const compactQPS = 50000 31 32 // Run starts tester. 33 func (clus *Cluster) Run() { 34 defer printReport() 35 36 if err := fileutil.TouchDirAll(clus.Tester.DataDir); err != nil { 37 clus.lg.Panic( 38 "failed to create test data directory", 39 zap.String("dir", clus.Tester.DataDir), 40 zap.Error(err), 41 ) 42 } 43 44 var preModifiedKey int64 45 for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ { 46 roundTotalCounter.Inc() 47 clus.rd = round 48 49 if err := clus.doRound(); err != nil { 50 clus.lg.Warn( 51 "round FAIL", 52 zap.Int("round", clus.rd), 53 zap.Int("case", clus.cs), 54 zap.Int("case-total", len(clus.cases)), 55 zap.Error(err), 56 ) 57 if clus.cleanup() != nil { 58 return 59 } 60 // reset preModifiedKey after clean up 61 preModifiedKey = 0 62 continue 63 } 64 65 // -1 so that logPrefix doesn't print out 'case' 66 clus.cs = -1 67 68 revToCompact := max(0, clus.currentRevision-10000) 69 currentModifiedKey := clus.stresser.ModifiedKeys() 70 modifiedKey := currentModifiedKey - preModifiedKey 71 preModifiedKey = currentModifiedKey 72 timeout := 10 * time.Second 73 timeout += time.Duration(modifiedKey/compactQPS) * time.Second 74 clus.lg.Info( 75 "compact START", 76 zap.Int("round", clus.rd), 77 zap.Int("case", clus.cs), 78 zap.Int("case-total", len(clus.cases)), 79 zap.Duration("timeout", timeout), 80 ) 81 if err := clus.compact(revToCompact, timeout); err != nil { 82 clus.lg.Warn( 83 "compact FAIL", 84 zap.Int("round", clus.rd), 85 zap.Int("case", clus.cs), 86 zap.Int("case-total", len(clus.cases)), 87 zap.Error(err), 88 ) 89 if err = clus.cleanup(); err != nil { 90 clus.lg.Warn( 91 "cleanup FAIL", 92 zap.Int("round", clus.rd), 93 zap.Int("case", clus.cs), 94 zap.Int("case-total", len(clus.cases)), 95 zap.Error(err), 96 ) 97 return 98 } 99 // reset preModifiedKey after clean up 100 preModifiedKey = 0 101 } 102 if round > 0 && round%500 == 0 { // every 500 rounds 103 if err := clus.defrag(); err != nil { 104 clus.failed() 105 return 106 } 107 } 108 } 109 110 clus.lg.Info( 111 "functional-tester PASS", 112 zap.Int("round", clus.rd), 113 zap.Int("case", clus.cs), 114 zap.Int("case-total", len(clus.cases)), 115 ) 116 } 117 118 func (clus *Cluster) doRound() error { 119 if clus.Tester.CaseShuffle { 120 clus.shuffleCases() 121 } 122 123 roundNow := time.Now() 124 clus.lg.Info( 125 "round START", 126 zap.Int("round", clus.rd), 127 zap.Int("case", clus.cs), 128 zap.Int("case-total", len(clus.cases)), 129 zap.Strings("cases", clus.listCases()), 130 ) 131 for i, fa := range clus.cases { 132 clus.cs = i 133 134 caseTotal[fa.Desc()]++ 135 caseTotalCounter.WithLabelValues(fa.Desc()).Inc() 136 137 caseNow := time.Now() 138 clus.lg.Info( 139 "case START", 140 zap.Int("round", clus.rd), 141 zap.Int("case", clus.cs), 142 zap.Int("case-total", len(clus.cases)), 143 zap.String("desc", fa.Desc()), 144 ) 145 146 clus.lg.Info("wait health before injecting failures") 147 if err := clus.WaitHealth(); err != nil { 148 return fmt.Errorf("wait full health error: %v", err) 149 } 150 151 stressStarted := false 152 fcase := fa.TestCase() 153 if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS { 154 clus.lg.Info( 155 "stress START", 156 zap.Int("round", clus.rd), 157 zap.Int("case", clus.cs), 158 zap.Int("case-total", len(clus.cases)), 159 zap.String("desc", fa.Desc()), 160 ) 161 if err := clus.stresser.Stress(); err != nil { 162 return fmt.Errorf("start stresser error: %v", err) 163 } 164 stressStarted = true 165 } 166 167 clus.lg.Info( 168 "inject START", 169 zap.Int("round", clus.rd), 170 zap.Int("case", clus.cs), 171 zap.Int("case-total", len(clus.cases)), 172 zap.String("desc", fa.Desc()), 173 ) 174 if err := fa.Inject(clus); err != nil { 175 return fmt.Errorf("injection error: %v", err) 176 } 177 178 // if run local, recovering server may conflict 179 // with stressing client ports 180 // TODO: use unix for local tests 181 clus.lg.Info( 182 "recover START", 183 zap.Int("round", clus.rd), 184 zap.Int("case", clus.cs), 185 zap.Int("case-total", len(clus.cases)), 186 zap.String("desc", fa.Desc()), 187 ) 188 if err := fa.Recover(clus); err != nil { 189 return fmt.Errorf("recovery error: %v", err) 190 } 191 192 if stressStarted { 193 clus.lg.Info( 194 "stress PAUSE", 195 zap.Int("round", clus.rd), 196 zap.Int("case", clus.cs), 197 zap.Int("case-total", len(clus.cases)), 198 zap.String("desc", fa.Desc()), 199 ) 200 ems := clus.stresser.Pause() 201 if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 { 202 ess := make([]string, 0, len(ems)) 203 cnt := 0 204 for k, v := range ems { 205 ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v)) 206 cnt += v 207 } 208 clus.lg.Warn( 209 "expected no errors", 210 zap.String("desc", fa.Desc()), 211 zap.Strings("errors", ess), 212 ) 213 214 // with network delay, some ongoing requests may fail 215 // only return error, if more than 10% of QPS requests fail 216 if cnt > int(clus.Tester.StressQPS)/10 { 217 return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess) 218 } 219 } 220 } 221 222 clus.lg.Info( 223 "health check START", 224 zap.Int("round", clus.rd), 225 zap.Int("case", clus.cs), 226 zap.Int("case-total", len(clus.cases)), 227 zap.String("desc", fa.Desc()), 228 ) 229 if err := clus.WaitHealth(); err != nil { 230 return fmt.Errorf("wait full health error: %v", err) 231 } 232 233 checkerFailExceptions := []rpcpb.Checker{} 234 switch fcase { 235 case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH: 236 // TODO: restore from snapshot 237 checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE) 238 } 239 240 clus.lg.Info( 241 "consistency check START", 242 zap.Int("round", clus.rd), 243 zap.Int("case", clus.cs), 244 zap.Int("case-total", len(clus.cases)), 245 zap.String("desc", fa.Desc()), 246 ) 247 if err := clus.runCheckers(checkerFailExceptions...); err != nil { 248 return fmt.Errorf("consistency check error (%v)", err) 249 } 250 clus.lg.Info( 251 "consistency check PASS", 252 zap.Int("round", clus.rd), 253 zap.Int("case", clus.cs), 254 zap.Int("case-total", len(clus.cases)), 255 zap.String("desc", fa.Desc()), 256 zap.Duration("took", time.Since(caseNow)), 257 ) 258 } 259 260 clus.lg.Info( 261 "round ALL PASS", 262 zap.Int("round", clus.rd), 263 zap.Strings("cases", clus.listCases()), 264 zap.Int("case-total", len(clus.cases)), 265 zap.Duration("took", time.Since(roundNow)), 266 ) 267 return nil 268 } 269 270 func (clus *Cluster) updateRevision() error { 271 revs, _, err := clus.getRevisionHash() 272 for _, rev := range revs { 273 clus.currentRevision = rev 274 break // just need get one of the current revisions 275 } 276 277 clus.lg.Info( 278 "updated current revision", 279 zap.Int64("current-revision", clus.currentRevision), 280 ) 281 return err 282 } 283 284 func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) { 285 if err = clus.compactKV(rev, timeout); err != nil { 286 clus.lg.Warn( 287 "compact FAIL", 288 zap.Int64("current-revision", clus.currentRevision), 289 zap.Int64("compact-revision", rev), 290 zap.Error(err), 291 ) 292 return err 293 } 294 clus.lg.Info( 295 "compact DONE", 296 zap.Int64("current-revision", clus.currentRevision), 297 zap.Int64("compact-revision", rev), 298 ) 299 300 if err = clus.checkCompact(rev); err != nil { 301 clus.lg.Warn( 302 "check compact FAIL", 303 zap.Int64("current-revision", clus.currentRevision), 304 zap.Int64("compact-revision", rev), 305 zap.Error(err), 306 ) 307 return err 308 } 309 clus.lg.Info( 310 "check compact DONE", 311 zap.Int64("current-revision", clus.currentRevision), 312 zap.Int64("compact-revision", rev), 313 ) 314 315 return nil 316 } 317 318 func (clus *Cluster) failed() { 319 clus.lg.Info( 320 "functional-tester FAIL", 321 zap.Int("round", clus.rd), 322 zap.Int("case", clus.cs), 323 zap.Int("case-total", len(clus.cases)), 324 ) 325 clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() 326 327 os.Exit(2) 328 } 329 330 func (clus *Cluster) cleanup() error { 331 if clus.Tester.ExitOnCaseFail { 332 defer clus.failed() 333 } 334 335 roundFailedTotalCounter.Inc() 336 desc := "compact/defrag" 337 if clus.cs != -1 { 338 desc = clus.cases[clus.cs].Desc() 339 } 340 caseFailedTotalCounter.WithLabelValues(desc).Inc() 341 342 clus.lg.Info( 343 "closing stressers before archiving failure data", 344 zap.Int("round", clus.rd), 345 zap.Int("case", clus.cs), 346 zap.Int("case-total", len(clus.cases)), 347 ) 348 clus.stresser.Close() 349 350 if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil { 351 clus.lg.Warn( 352 "cleanup FAIL", 353 zap.Int("round", clus.rd), 354 zap.Int("case", clus.cs), 355 zap.Int("case-total", len(clus.cases)), 356 zap.Error(err), 357 ) 358 return err 359 } 360 if err := clus.send_RESTART_ETCD(); err != nil { 361 clus.lg.Warn( 362 "restart FAIL", 363 zap.Int("round", clus.rd), 364 zap.Int("case", clus.cs), 365 zap.Int("case-total", len(clus.cases)), 366 zap.Error(err), 367 ) 368 return err 369 } 370 371 clus.setStresserChecker() 372 return nil 373 }