github.com/pingcap/chaos@v0.0.0-20190710112158-c86faf4b3719/pkg/control/control.go (about) 1 package control 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "sync" 8 "sync/atomic" 9 "time" 10 11 "github.com/pingcap/chaos/pkg/core" 12 "github.com/pingcap/chaos/pkg/history" 13 "github.com/pingcap/chaos/pkg/verify" 14 15 // register nemesis 16 _ "github.com/pingcap/chaos/pkg/nemesis" 17 18 // register tidb 19 _ "github.com/pingcap/chaos/db/tidb" 20 ) 21 22 // Controller controls the whole cluster. It sends request to the database, 23 // and also uses nemesis to disturb the cluster. 24 // Here have only 5 nodes, and the hosts are n1 - n5. 25 type Controller struct { 26 cfg *Config 27 28 clients []core.Client 29 30 nemesisGenerators []core.NemesisGenerator 31 32 ctx context.Context 33 cancel context.CancelFunc 34 35 proc int64 36 requestCount int64 37 38 suit verify.Suit 39 } 40 41 // NewController creates a controller. 42 func NewController( 43 ctx context.Context, 44 cfg *Config, 45 clientCreator core.ClientCreator, 46 nemesisGenerators []core.NemesisGenerator, 47 verifySuit verify.Suit, 48 ) *Controller { 49 cfg.adjust() 50 51 if len(cfg.DB) == 0 { 52 log.Fatalf("empty database") 53 } 54 55 if db := core.GetDB(cfg.DB); db == nil { 56 log.Fatalf("database %s is not registered", cfg.DB) 57 } 58 59 c := new(Controller) 60 c.cfg = cfg 61 c.ctx, c.cancel = context.WithCancel(ctx) 62 c.nemesisGenerators = nemesisGenerators 63 c.suit = verifySuit 64 65 for _, node := range c.cfg.Nodes { 66 c.clients = append(c.clients, clientCreator.Create(node)) 67 } 68 69 log.Printf("start controller with %+v", cfg) 70 71 return c 72 } 73 74 // Close closes the controller. 75 func (c *Controller) Close() { 76 c.cancel() 77 } 78 79 // Run runs the controller. 80 func (c *Controller) Run() { 81 c.setUpDB() 82 c.setUpClient() 83 84 nctx, ncancel := context.WithTimeout(c.ctx, c.cfg.RunTime*time.Duration(int64(c.cfg.RunRound))) 85 var nemesisWg sync.WaitGroup 86 nemesisWg.Add(1) 87 go func() { 88 defer nemesisWg.Done() 89 c.dispatchNemesis(nctx) 90 }() 91 92 ROUND: 93 for round := 1; round <= c.cfg.RunRound; round++ { 94 log.Printf("round %d start ...", round) 95 96 ctx, cancel := context.WithTimeout(c.ctx, c.cfg.RunTime) 97 98 historyFile := fmt.Sprintf("%s.%d", c.cfg.History, round) 99 recorder, err := history.NewRecorder(historyFile) 100 if err != nil { 101 log.Fatalf("prepare history failed %v", err) 102 } 103 104 if err := c.dumpState(ctx, recorder); err != nil { 105 log.Fatalf("dump state failed %v", err) 106 } 107 108 // requestCount for the round, shared by all clients. 109 requestCount := int64(c.cfg.RequestCount) 110 log.Printf("total request count %d", requestCount) 111 112 n := len(c.cfg.Nodes) 113 var clientWg sync.WaitGroup 114 clientWg.Add(n) 115 for i := 0; i < n; i++ { 116 go func(i int) { 117 defer clientWg.Done() 118 c.onClientLoop(ctx, i, &requestCount, recorder) 119 }(i) 120 } 121 122 clientWg.Wait() 123 cancel() 124 125 recorder.Close() 126 c.suit.Verify(historyFile) 127 128 select { 129 case <-c.ctx.Done(): 130 log.Printf("finish test") 131 break ROUND 132 default: 133 } 134 135 log.Printf("round %d finish", round) 136 } 137 138 ncancel() 139 nemesisWg.Wait() 140 141 c.tearDownClient() 142 c.tearDownDB() 143 } 144 145 func (c *Controller) syncExec(f func(i int)) { 146 var wg sync.WaitGroup 147 n := len(c.cfg.Nodes) 148 wg.Add(n) 149 for i := 0; i < n; i++ { 150 go func(i int) { 151 defer wg.Done() 152 f(i) 153 }(i) 154 } 155 wg.Wait() 156 } 157 158 func (c *Controller) setUpDB() { 159 log.Printf("begin to set up database") 160 c.syncExec(func(i int) { 161 log.Printf("begin to set up database on %s", c.cfg.Nodes[i]) 162 db := core.GetDB(c.cfg.DB) 163 err := db.SetUp(c.ctx, c.cfg.Nodes, c.cfg.Nodes[i]) 164 if err != nil { 165 log.Fatalf("setup db %s at node %s failed %v", c.cfg.DB, c.cfg.Nodes[i], err) 166 } 167 }) 168 } 169 170 func (c *Controller) tearDownDB() { 171 log.Printf("begin to tear down database") 172 c.syncExec(func(i int) { 173 log.Printf("being to tear down database on %s", c.cfg.Nodes[i]) 174 db := core.GetDB(c.cfg.DB) 175 if err := db.TearDown(c.ctx, c.cfg.Nodes, c.cfg.Nodes[i]); err != nil { 176 log.Printf("tear down db %s at node %s failed %v", c.cfg.DB, c.cfg.Nodes[i], err) 177 } 178 }) 179 } 180 181 func (c *Controller) setUpClient() { 182 log.Printf("begin to set up client") 183 c.syncExec(func(i int) { 184 client := c.clients[i] 185 node := c.cfg.Nodes[i] 186 log.Printf("begin to set up db client for node %s", node) 187 if err := client.SetUp(c.ctx, c.cfg.Nodes, node); err != nil { 188 log.Fatalf("set up db client for node %s failed %v", node, err) 189 } 190 }) 191 } 192 193 func (c *Controller) tearDownClient() { 194 log.Printf("begin to tear down client") 195 c.syncExec(func(i int) { 196 client := c.clients[i] 197 node := c.cfg.Nodes[i] 198 log.Printf("begin to tear down db client for node %s", node) 199 if err := client.TearDown(c.ctx, c.cfg.Nodes, node); err != nil { 200 log.Printf("tear down db client for node %s failed %v", node, err) 201 } 202 }) 203 } 204 205 func (c *Controller) dumpState(ctx context.Context, recorder *history.Recorder) error { 206 ctx, cancel := context.WithCancel(c.ctx) 207 defer cancel() 208 209 for _, client := range c.clients { 210 for _, node := range c.cfg.Nodes { 211 log.Printf("begin to dump on node %s", node) 212 sum, err := client.DumpState(ctx) 213 if err == nil { 214 recorder.RecordState(sum) 215 return nil 216 } 217 } 218 } 219 return fmt.Errorf("fail to dump") 220 } 221 222 func (c *Controller) onClientLoop( 223 ctx context.Context, 224 i int, 225 requestCount *int64, 226 recorder *history.Recorder, 227 ) { 228 client := c.clients[i] 229 node := c.cfg.Nodes[i] 230 231 log.Printf("begin to run command on node %s", node) 232 233 ctx, cancel := context.WithCancel(ctx) 234 defer cancel() 235 236 procID := atomic.AddInt64(&c.proc, 1) 237 for atomic.AddInt64(requestCount, -1) >= 0 { 238 request := client.NextRequest() 239 240 if err := recorder.RecordRequest(procID, request); err != nil { 241 log.Fatalf("record request %v failed %v", request, err) 242 } 243 244 log.Printf("%s: call %+v", node, request) 245 response := client.Invoke(ctx, node, request) 246 log.Printf("%s: return %+v", node, response) 247 isUnknown := true 248 if v, ok := response.(core.UnknownResponse); ok { 249 isUnknown = v.IsUnknown() 250 } 251 252 if err := recorder.RecordResponse(procID, response); err != nil { 253 log.Fatalf("record response %v failed %v", response, err) 254 } 255 256 // If Unknown, we need to use another process ID. 257 if isUnknown { 258 procID = atomic.AddInt64(&c.proc, 1) 259 } 260 261 select { 262 case <-ctx.Done(): 263 return 264 default: 265 } 266 } 267 } 268 269 func (c *Controller) dispatchNemesis(ctx context.Context) { 270 if len(c.nemesisGenerators) == 0 { 271 return 272 } 273 274 log.Printf("begin to run nemesis") 275 var wg sync.WaitGroup 276 n := len(c.cfg.Nodes) 277 LOOP: 278 for { 279 for _, g := range c.nemesisGenerators { 280 select { 281 case <-ctx.Done(): 282 break LOOP 283 default: 284 } 285 286 log.Printf("begin to run %s nemesis generator", g.Name()) 287 ops := g.Generate(c.cfg.Nodes) 288 289 wg.Add(n) 290 for i := 0; i < n; i++ { 291 go c.onNemesisLoop(ctx, i, ops[i], &wg) 292 } 293 wg.Wait() 294 } 295 } 296 log.Printf("stop to run nemesis") 297 } 298 299 func (c *Controller) onNemesisLoop(ctx context.Context, index int, op *core.NemesisOperation, wg *sync.WaitGroup) { 300 defer wg.Done() 301 302 if op == nil { 303 return 304 } 305 306 nemesis := core.GetNemesis(op.Name) 307 if nemesis == nil { 308 log.Printf("nemesis %s is not registered", op.Name) 309 return 310 } 311 312 node := c.cfg.Nodes[index] 313 314 log.Printf("run nemesis %s on %s", op.Name, node) 315 if err := nemesis.Invoke(ctx, node, op.InvokeArgs...); err != nil { 316 log.Printf("run nemesis %s on %s failed: %v", op.Name, node, err) 317 } 318 319 select { 320 case <-time.After(op.RunTime): 321 case <-ctx.Done(): 322 } 323 if err := nemesis.Recover(ctx, node, op.RecoverArgs...); err != nil { 324 log.Printf("run nemesis %s on %s failed: %v", op.Name, node, err) 325 } 326 }