github.com/pingcap/chaos@v0.0.0-20190710112158-c86faf4b3719/pkg/control/control.go (about)

     1  package control
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"sync/atomic"
     9  	"time"
    10  
    11  	"github.com/pingcap/chaos/pkg/core"
    12  	"github.com/pingcap/chaos/pkg/history"
    13  	"github.com/pingcap/chaos/pkg/verify"
    14  
    15  	// register nemesis
    16  	_ "github.com/pingcap/chaos/pkg/nemesis"
    17  
    18  	// register tidb
    19  	_ "github.com/pingcap/chaos/db/tidb"
    20  )
    21  
    22  // Controller controls the whole cluster. It sends request to the database,
    23  // and also uses nemesis to disturb the cluster.
    24  // Here have only 5 nodes, and the hosts are n1 - n5.
    25  type Controller struct {
    26  	cfg *Config
    27  
    28  	clients []core.Client
    29  
    30  	nemesisGenerators []core.NemesisGenerator
    31  
    32  	ctx    context.Context
    33  	cancel context.CancelFunc
    34  
    35  	proc         int64
    36  	requestCount int64
    37  
    38  	suit verify.Suit
    39  }
    40  
    41  // NewController creates a controller.
    42  func NewController(
    43  	ctx context.Context,
    44  	cfg *Config,
    45  	clientCreator core.ClientCreator,
    46  	nemesisGenerators []core.NemesisGenerator,
    47  	verifySuit verify.Suit,
    48  ) *Controller {
    49  	cfg.adjust()
    50  
    51  	if len(cfg.DB) == 0 {
    52  		log.Fatalf("empty database")
    53  	}
    54  
    55  	if db := core.GetDB(cfg.DB); db == nil {
    56  		log.Fatalf("database %s is not registered", cfg.DB)
    57  	}
    58  
    59  	c := new(Controller)
    60  	c.cfg = cfg
    61  	c.ctx, c.cancel = context.WithCancel(ctx)
    62  	c.nemesisGenerators = nemesisGenerators
    63  	c.suit = verifySuit
    64  
    65  	for _, node := range c.cfg.Nodes {
    66  		c.clients = append(c.clients, clientCreator.Create(node))
    67  	}
    68  
    69  	log.Printf("start controller with %+v", cfg)
    70  
    71  	return c
    72  }
    73  
    74  // Close closes the controller.
    75  func (c *Controller) Close() {
    76  	c.cancel()
    77  }
    78  
    79  // Run runs the controller.
    80  func (c *Controller) Run() {
    81  	c.setUpDB()
    82  	c.setUpClient()
    83  
    84  	nctx, ncancel := context.WithTimeout(c.ctx, c.cfg.RunTime*time.Duration(int64(c.cfg.RunRound)))
    85  	var nemesisWg sync.WaitGroup
    86  	nemesisWg.Add(1)
    87  	go func() {
    88  		defer nemesisWg.Done()
    89  		c.dispatchNemesis(nctx)
    90  	}()
    91  
    92  ROUND:
    93  	for round := 1; round <= c.cfg.RunRound; round++ {
    94  		log.Printf("round %d start ...", round)
    95  
    96  		ctx, cancel := context.WithTimeout(c.ctx, c.cfg.RunTime)
    97  
    98  		historyFile := fmt.Sprintf("%s.%d", c.cfg.History, round)
    99  		recorder, err := history.NewRecorder(historyFile)
   100  		if err != nil {
   101  			log.Fatalf("prepare history failed %v", err)
   102  		}
   103  
   104  		if err := c.dumpState(ctx, recorder); err != nil {
   105  			log.Fatalf("dump state failed %v", err)
   106  		}
   107  
   108  		// requestCount for the round, shared by all clients.
   109  		requestCount := int64(c.cfg.RequestCount)
   110  		log.Printf("total request count %d", requestCount)
   111  
   112  		n := len(c.cfg.Nodes)
   113  		var clientWg sync.WaitGroup
   114  		clientWg.Add(n)
   115  		for i := 0; i < n; i++ {
   116  			go func(i int) {
   117  				defer clientWg.Done()
   118  				c.onClientLoop(ctx, i, &requestCount, recorder)
   119  			}(i)
   120  		}
   121  
   122  		clientWg.Wait()
   123  		cancel()
   124  
   125  		recorder.Close()
   126  		c.suit.Verify(historyFile)
   127  
   128  		select {
   129  		case <-c.ctx.Done():
   130  			log.Printf("finish test")
   131  			break ROUND
   132  		default:
   133  		}
   134  
   135  		log.Printf("round %d finish", round)
   136  	}
   137  
   138  	ncancel()
   139  	nemesisWg.Wait()
   140  
   141  	c.tearDownClient()
   142  	c.tearDownDB()
   143  }
   144  
   145  func (c *Controller) syncExec(f func(i int)) {
   146  	var wg sync.WaitGroup
   147  	n := len(c.cfg.Nodes)
   148  	wg.Add(n)
   149  	for i := 0; i < n; i++ {
   150  		go func(i int) {
   151  			defer wg.Done()
   152  			f(i)
   153  		}(i)
   154  	}
   155  	wg.Wait()
   156  }
   157  
   158  func (c *Controller) setUpDB() {
   159  	log.Printf("begin to set up database")
   160  	c.syncExec(func(i int) {
   161  		log.Printf("begin to set up database on %s", c.cfg.Nodes[i])
   162  		db := core.GetDB(c.cfg.DB)
   163  		err := db.SetUp(c.ctx, c.cfg.Nodes, c.cfg.Nodes[i])
   164  		if err != nil {
   165  			log.Fatalf("setup db %s at node %s failed %v", c.cfg.DB, c.cfg.Nodes[i], err)
   166  		}
   167  	})
   168  }
   169  
   170  func (c *Controller) tearDownDB() {
   171  	log.Printf("begin to tear down database")
   172  	c.syncExec(func(i int) {
   173  		log.Printf("being to tear down database on %s", c.cfg.Nodes[i])
   174  		db := core.GetDB(c.cfg.DB)
   175  		if err := db.TearDown(c.ctx, c.cfg.Nodes, c.cfg.Nodes[i]); err != nil {
   176  			log.Printf("tear down db %s at node %s failed %v", c.cfg.DB, c.cfg.Nodes[i], err)
   177  		}
   178  	})
   179  }
   180  
   181  func (c *Controller) setUpClient() {
   182  	log.Printf("begin to set up client")
   183  	c.syncExec(func(i int) {
   184  		client := c.clients[i]
   185  		node := c.cfg.Nodes[i]
   186  		log.Printf("begin to set up db client for node %s", node)
   187  		if err := client.SetUp(c.ctx, c.cfg.Nodes, node); err != nil {
   188  			log.Fatalf("set up db client for node %s failed %v", node, err)
   189  		}
   190  	})
   191  }
   192  
   193  func (c *Controller) tearDownClient() {
   194  	log.Printf("begin to tear down client")
   195  	c.syncExec(func(i int) {
   196  		client := c.clients[i]
   197  		node := c.cfg.Nodes[i]
   198  		log.Printf("begin to tear down db client for node %s", node)
   199  		if err := client.TearDown(c.ctx, c.cfg.Nodes, node); err != nil {
   200  			log.Printf("tear down db client for node %s failed %v", node, err)
   201  		}
   202  	})
   203  }
   204  
   205  func (c *Controller) dumpState(ctx context.Context, recorder *history.Recorder) error {
   206  	ctx, cancel := context.WithCancel(c.ctx)
   207  	defer cancel()
   208  
   209  	for _, client := range c.clients {
   210  		for _, node := range c.cfg.Nodes {
   211  			log.Printf("begin to dump on node %s", node)
   212  			sum, err := client.DumpState(ctx)
   213  			if err == nil {
   214  				recorder.RecordState(sum)
   215  				return nil
   216  			}
   217  		}
   218  	}
   219  	return fmt.Errorf("fail to dump")
   220  }
   221  
   222  func (c *Controller) onClientLoop(
   223  	ctx context.Context,
   224  	i int,
   225  	requestCount *int64,
   226  	recorder *history.Recorder,
   227  ) {
   228  	client := c.clients[i]
   229  	node := c.cfg.Nodes[i]
   230  
   231  	log.Printf("begin to run command on node %s", node)
   232  
   233  	ctx, cancel := context.WithCancel(ctx)
   234  	defer cancel()
   235  
   236  	procID := atomic.AddInt64(&c.proc, 1)
   237  	for atomic.AddInt64(requestCount, -1) >= 0 {
   238  		request := client.NextRequest()
   239  
   240  		if err := recorder.RecordRequest(procID, request); err != nil {
   241  			log.Fatalf("record request %v failed %v", request, err)
   242  		}
   243  
   244  		log.Printf("%s: call %+v", node, request)
   245  		response := client.Invoke(ctx, node, request)
   246  		log.Printf("%s: return %+v", node, response)
   247  		isUnknown := true
   248  		if v, ok := response.(core.UnknownResponse); ok {
   249  			isUnknown = v.IsUnknown()
   250  		}
   251  
   252  		if err := recorder.RecordResponse(procID, response); err != nil {
   253  			log.Fatalf("record response %v failed %v", response, err)
   254  		}
   255  
   256  		// If Unknown, we need to use another process ID.
   257  		if isUnknown {
   258  			procID = atomic.AddInt64(&c.proc, 1)
   259  		}
   260  
   261  		select {
   262  		case <-ctx.Done():
   263  			return
   264  		default:
   265  		}
   266  	}
   267  }
   268  
   269  func (c *Controller) dispatchNemesis(ctx context.Context) {
   270  	if len(c.nemesisGenerators) == 0 {
   271  		return
   272  	}
   273  
   274  	log.Printf("begin to run nemesis")
   275  	var wg sync.WaitGroup
   276  	n := len(c.cfg.Nodes)
   277  LOOP:
   278  	for {
   279  		for _, g := range c.nemesisGenerators {
   280  			select {
   281  			case <-ctx.Done():
   282  				break LOOP
   283  			default:
   284  			}
   285  
   286  			log.Printf("begin to run %s nemesis generator", g.Name())
   287  			ops := g.Generate(c.cfg.Nodes)
   288  
   289  			wg.Add(n)
   290  			for i := 0; i < n; i++ {
   291  				go c.onNemesisLoop(ctx, i, ops[i], &wg)
   292  			}
   293  			wg.Wait()
   294  		}
   295  	}
   296  	log.Printf("stop to run nemesis")
   297  }
   298  
   299  func (c *Controller) onNemesisLoop(ctx context.Context, index int, op *core.NemesisOperation, wg *sync.WaitGroup) {
   300  	defer wg.Done()
   301  
   302  	if op == nil {
   303  		return
   304  	}
   305  
   306  	nemesis := core.GetNemesis(op.Name)
   307  	if nemesis == nil {
   308  		log.Printf("nemesis %s is not registered", op.Name)
   309  		return
   310  	}
   311  
   312  	node := c.cfg.Nodes[index]
   313  
   314  	log.Printf("run nemesis %s on %s", op.Name, node)
   315  	if err := nemesis.Invoke(ctx, node, op.InvokeArgs...); err != nil {
   316  		log.Printf("run nemesis %s on %s failed: %v", op.Name, node, err)
   317  	}
   318  
   319  	select {
   320  	case <-time.After(op.RunTime):
   321  	case <-ctx.Done():
   322  	}
   323  	if err := nemesis.Recover(ctx, node, op.RecoverArgs...); err != nil {
   324  		log.Printf("run nemesis %s on %s failed: %v", op.Name, node, err)
   325  	}
   326  }