github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/chaos/cases/task.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"database/sql"
	"fmt"
	"math/rand"
	"strings"
	"time"

	"github.com/chaos-mesh/go-sqlsmith"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/util/dbutil"
	config2 "github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/config/dbconfig"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/conn"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
)

const (
	tableCount      = 10               // number of tables in the schema.
	fullInsertCount = 100              // number of `INSERT INTO` statements (not row count) per table in the full stage.
	diffCount       = 20               // number of data diff checks.
	diffInterval    = 20 * time.Second // interval between data diff checks.
	incrRoundTime   = 10 * time.Second // time to generate incremental data in one round.
)

// task is a data migration task test case with one or more sources.
type task struct {
	logger log.Logger
	ctx    context.Context

	cli pb.MasterClient
	ss  []*sqlsmith.SQLSmith

	sourceDBs   []*conn.BaseDB
	sourceConns []*dbConn
	targetDB    *conn.BaseDB
	targetConn  *dbConn

	schema  string
	tables  []string
	taskCfg config2.TaskConfig
	results results

	caseGenerator *CaseGenerator
}

// newTask creates a new task instance.
func newTask(ctx context.Context, cli pb.MasterClient, taskFile string, schema string,
	targetCfg dbconfig.DBConfig, sourcesCfg ...dbconfig.DBConfig,
) (*task, error) {
	var taskCfg config2.TaskConfig
	err := taskCfg.DecodeFile(taskFile)
	if err != nil {
		return nil, err
	}
	taskCfg.TargetDB = &targetCfg // replace DB config

	var (
		sourceDBs   = make([]*conn.BaseDB, 0, len(taskCfg.MySQLInstances))
		sourceConns = make([]*dbConn, 0, len(taskCfg.MySQLInstances))
		res         = make(results, 0, len(taskCfg.MySQLInstances))
	)
	for i, m := range taskCfg.MySQLInstances { // only use the necessary part of sources.
		// reset Syncer, otherwise it will report ERROR 20017
		if len(m.SyncerConfigName) > 0 && m.Syncer != nil {
			m.Syncer = nil
		}

		cfg := sourcesCfg[i]
		db, err2 := conn.GetUpstreamDB(&cfg)
		if err2 != nil {
			return nil, err2
		}
		dbConnection, err2 := createDBConn(ctx, db, schema)
		if err2 != nil {
			return nil, err2
		}
		if taskCfg.CaseSensitive {
			lcSetting, err2 := conn.FetchLowerCaseTableNamesSetting(ctx, dbConnection.baseConn)
			if err2 != nil {
				return nil, err2
			}
			if lcSetting == conn.LCTableNamesMixed {
				msg := "can not set `case-sensitive = true` when upstream `lower_case_table_names = 2`"
				log.L().Error(msg, zap.Any("instance", cfg))
				return nil, errors.New(msg)
			}
		}
		sourceDBs = append(sourceDBs, db)
		sourceConns = append(sourceConns, dbConnection)
		res = append(res, singleResult{})
	}

	targetDB, err := conn.GetDownstreamDB(&targetCfg)
	if err != nil {
		return nil, err
	}
	targetConn, err := createDBConn(ctx, targetDB, schema)
	if err != nil {
		return nil, err
	}

	t := &task{
		logger:        log.L().WithFields(zap.String("case", taskCfg.Name)),
		ctx:           ctx,
		cli:           cli,
		ss:            make([]*sqlsmith.SQLSmith, len(taskCfg.MySQLInstances)),
		sourceDBs:     sourceDBs,
		sourceConns:   sourceConns,
		targetDB:      targetDB,
		targetConn:    targetConn,
		schema:        schema,
		tables:        make([]string, 0),
		taskCfg:       taskCfg,
		results:       res,
		caseGenerator: NewCaseGenerator(taskCfg.ShardMode),
	}
	for i := 0; i < len(t.ss); i++ {
		t.ss[i] = sqlsmith.New()
		t.ss[i].SetDB(schema)
	}
	return t, nil
}

// run runs the case.
func (t *task) run() error {
	defer func() {
		for _, db := range t.sourceDBs {
			db.Close()
		}
		t.targetDB.Close()

		t.logger.Info("task runs results", zap.Stringer("results", t.results))
	}()

	if err := t.stopPreviousTask(); err != nil {
		return err
	}
	if err := t.clearPreviousData(); err != nil {
		return err
	}

	if err := t.genFullData(); err != nil {
		return err
	}

	if err := t.createTask(); err != nil {
		return err
	}

	t.logger.Info("check data for full stage")
	sourceDBs := make([]*sql.DB, 0, len(t.sourceDBs))
	for _, db := range t.sourceDBs {
		sourceDBs = append(sourceDBs, db.DB)
	}
	if err := diffDataLoop(t.ctx, diffCount, diffInterval, t.schema, t.tables, t.targetDB.DB, sourceDBs...); err != nil {
		return err
	}

	return t.incrLoop()
}

// stopPreviousTask stops the previous task with the same name if it exists.
func (t *task) stopPreviousTask() error {
	t.logger.Info("stopping previous task")
	resp, err := t.cli.OperateTask(t.ctx, &pb.OperateTaskRequest{
		Op:   pb.TaskOp_Delete,
		Name: t.taskCfg.Name,
	})
	if err != nil {
		return err
	} else if !resp.Result && !strings.Contains(resp.Msg, "not exist") {
		return fmt.Errorf("fail to stop task: %s", resp.Msg)
	}
	return nil
}

// clearPreviousData clears previous data in the upstream sources and the downstream target.
func (t *task) clearPreviousData() error {
	t.logger.Info("clearing previous source and target data")
	for _, conn := range t.sourceConns {
		if err := dropDatabase(t.ctx, conn, t.schema); err != nil {
			return err
		}
	}
	return dropDatabase(t.ctx, t.targetConn, t.schema)
}

// genFullData generates data for the full stage.
func (t *task) genFullData() error {
	t.logger.Info("generating data for full stage")
	for _, conn := range t.sourceConns {
		if err := createDatabase(t.ctx, conn, t.schema); err != nil {
			return err
		}
		// NOTE: we set the CURRENT database here.
		if err := conn.execSQLs(t.ctx, fmt.Sprintf("USE %s", t.schema)); err != nil {
			return err
		}
	}

	var (
		columns = make([][5]string, 0)
		indexes = make(map[string][]string)
	)

	// generate `CREATE TABLE` statements.
	for i := 0; i < tableCount; i++ {
		query, name, err := t.ss[0].CreateTableStmt()
		if err != nil {
			return err
		}
		t.logger.Info("creating table", zap.String("query", query))
		for j, conn := range t.sourceConns {
			if err = conn.execSQLs(t.ctx, query); err != nil {
				return err
			}
			// set a different `AUTO_INCREMENT` base per source to avoid duplicate entries for `INSERT`.
			if err = conn.execSQLs(t.ctx, fmt.Sprintf("ALTER TABLE %s AUTO_INCREMENT = %d", name, 1+j*100000000)); err != nil {
				return err
			}
		}
		t.tables = append(t.tables, name)

		col2, idx2, err := createTableToSmithSchema(t.schema, query)
		if err != nil {
			return err
		}
		columns = append(columns, col2...)
		indexes[name] = idx2
	}

	for i := 0; i < len(t.ss); i++ {
		// go-sqlsmith needs to load the schema before generating DML and `ALTER TABLE` statements.
		t.ss[i].LoadSchema(columns, indexes)
	}

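	// generate full-stage data concurrently, one goroutine per upstream source.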
	var eg errgroup.Group
	for _, conn := range t.sourceConns {
		conn2 := conn
		eg.Go(func() error {
			for i := 0; i < fullInsertCount; i++ {
				query, _, err2 := t.ss[0].InsertStmt(false)
				if err2 != nil {
					return err2
				}
				if err2 = conn2.execSQLs(t.ctx, query); err2 != nil {
					return err2
				}
			}
			return nil
		})
	}
	return eg.Wait()
}

// createTask does the `start-task` operation.
func (t *task) createTask() error {
	t.logger.Info("starting the task", zap.String("task cfg", t.taskCfg.String()))
	resp, err := t.cli.StartTask(t.ctx, &pb.StartTaskRequest{
		Task: t.taskCfg.String(),
	})
	if err != nil {
		return err
	} else if !resp.Result && !strings.Contains(resp.Msg, "already exist") { // imprecise match
		return fmt.Errorf("fail to start task: %s", resp.Msg)
	}
	return nil
}

// incrLoop enters the loop of generating incremental data and diffing it.
func (t *task) incrLoop() error {
	t.caseGenerator.Start(t.ctx, t.schema, t.tables)

	// execute preSQLs in the upstreams
	for _, sql := range t.caseGenerator.GetPreSQLs() {
		if err := t.sourceConns[sql.source].execDDLs(t.ctx, sql.statement); err != nil {
			return err
		}
	}
	if err := t.updateSchema(); err != nil {
		return err
	}

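	// each round lasts at most incrRoundTime: generate incremental data, then check data consistency.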
	for {
		select {
		case <-t.ctx.Done():
			return nil
		default:
			ctx2, cancel2 := context.WithTimeout(t.ctx, incrRoundTime)
			// generate data
			err := t.genIncrData(ctx2)
			if err != nil {
				cancel2()
				return err
			}

			// diff data
			err = t.diffIncrData(t.ctx)
			if err != nil {
				cancel2()
				return err
			}
			cancel2()
		}
	}
}

// genIncrData generates data for the incremental stage in one round.
// NOTE: it returns nil when the context is done.
func (t *task) genIncrData(pCtx context.Context) (err error) {
	t.logger.Info("generating data for incremental stage")
	getNewCase := true

	defer func() {
		if errors.Cause(err) == context.Canceled || errors.Cause(err) == context.DeadlineExceeded {
			log.L().Info("context done.", log.ShortError(err))
			err = nil // clear error for context done.
		} else if err != nil {
			select {
			case <-pCtx.Done():
				t.logger.Warn("ignore error when generating data for incremental stage", zap.Error(err))
				err = nil // some other errors like `connection is already closed` may also be reported for context done.
			default:
				if forceIgnoreExecSQLError(err) {
					t.logger.Warn("ignore error when generating data for incremental stage", zap.Error(err))
					// we don't know which connection was bad, so simply reset all of them for the next round.
					for _, conn := range t.sourceConns {
						if err2 := conn.resetConn(t.ctx); err2 != nil {
							t.logger.Warn("fail to reset connection", zap.Error(err2))
						}
					}
					err = nil
				}
			}
		}
	}()

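	// runCaseSQLs executes the next batch of test case SQLs from the case generator;
	// when the generator is exhausted, it clears getNewCase.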
	runCaseSQLs := func() error {
		testSQLs := t.caseGenerator.GetSQLs()
		if testSQLs == nil {
			getNewCase = false
			return nil
		}
		for _, testSQL := range testSQLs {
			log.L().Info("execute test case sql", zap.String("ddl", testSQL.statement), zap.Int("source", testSQL.source))
			if err2 := t.sourceConns[testSQL.source].execDDLs(t.ctx, testSQL.statement); err2 != nil {
				return err2
			}
		}
		return nil
	}

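	// before returning, drain the remaining test case SQLs so the current case runs to completion.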
	defer func() {
		log.L().Info("complete test case sql")
		for {
			if !getNewCase {
				return
			}

			if err2 := runCaseSQLs(); err2 != nil {
				err = err2
				return
			}
			if err2 := t.updateSchema(); err2 != nil {
				err = err2
				return
			}
		}
	}()

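	// main loop: randomly pick an upstream source and execute a random DML on it,
	// occasionally mixing in DDLs and test case SQLs.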
	for {
		select {
		case <-pCtx.Done():
			return nil
		default:
		}

		// for DML, we randomly choose an upstream source to execute the statement.
		idx := rand.Intn(len(t.sourceConns))
		query, typ, err := randDML(t.ss[idx])
		if err != nil {
			return err
		}
		if err = t.sourceConns[idx].execDDLs(t.ctx, query); err != nil {
			return err
		}

		switch typ {
		case insertDML:
			t.results[idx].Insert++
		case updateDML:
			t.results[idx].Update++
		case deleteDML:
			t.results[idx].Delete++
		default:
		}

		schemaChanged := false
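		// with a small probability (10/3000 per iteration), also generate a random DDL
		// and execute it on all upstream sources.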
		if rand.Intn(3000) < 10 {
			query, err = randDDL(t.ss[0])
			if err != nil {
				return err
			}

			// skip DDLs unsupported in optimistic mode, e.g. `ALTER TABLE table_name ADD COLUMN column_name INT NOT NULL;`.
			if t.taskCfg.ShardMode == config2.ShardOptimistic {
				if yes, err2 := isNotNullNonDefaultAddCol(query); err2 != nil {
					return err2
				} else if yes {
					continue
				}
			}

			t.logger.Info("executing DDL", zap.String("query", query))
			// for DDL, we execute the statement on all upstream sources.
			// NOTE: no reorder injection yet, even for optimistic shard DDL.

			var eg errgroup.Group
			for i, c := range t.sourceConns {
				conn2 := c
				i2 := i
				eg.Go(func() error {
					if err2 := conn2.execDDLs(t.ctx, query); err2 != nil {
						if conn.IsMySQLError(err2, mysql.ErrDupFieldName) {
							t.logger.Warn("ignore duplicate field name for ddl", log.ShortError(err2))
							return nil
						}
						return err2
					}
					t.results[i2].DDL++
					return nil
				})
			}
			if err = eg.Wait(); err != nil {
				return err
			}

			schemaChanged = true
		}

		if getNewCase && rand.Intn(100) < 10 {
			// execute SQLs of test cases
			if err = runCaseSQLs(); err != nil {
				return err
			}

			schemaChanged = true
		}

		if schemaChanged {
			if err = t.updateSchema(); err != nil {
				return err
			}
		}
	}
}

// diffIncrData checks data equality for the incremental stage in one round.
// NOTE: it returns nil when the context is done.
func (t *task) diffIncrData(ctx context.Context) (err error) {
	t.logger.Info("check data for incremental stage")

	defer func() {
		if errors.Cause(err) == context.Canceled || errors.Cause(err) == context.DeadlineExceeded {
			err = nil // clear error for context done.
		} else if err != nil {
			select {
			case <-ctx.Done():
				t.logger.Warn("ignore error when check data for incremental stage", zap.Error(err))
				err = nil // some other errors like `connection is already closed` may also be reported for context done.
			default:
			}
		}
	}()

	sourceDBs := make([]*sql.DB, 0, len(t.sourceDBs))
	for _, db := range t.sourceDBs {
		sourceDBs = append(sourceDBs, db.DB)
	}
	return diffDataLoop(ctx, diffCount, diffInterval, t.schema, t.tables, t.targetDB.DB, sourceDBs...)
}

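// updateSchema reloads the latest upstream table structures into each go-sqlsmith
// instance so that subsequently generated statements match the current schemas.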
func (t *task) updateSchema() error {
	ctx, cancel := context.WithTimeout(context.Background(), conn.DefaultDBTimeout)
	defer cancel()

	for i, db := range t.sourceDBs {
		columns := make([][5]string, 0)
		indexes := make(map[string][]string)
		for _, table := range t.tables {
			createTable, err := dbutil.GetCreateTableSQL(ctx, db.DB, t.schema, table)
			if err != nil {
				return err
			}
			col, idx, err := createTableToSmithSchema(t.schema, createTable)
			if err != nil {
				return err
			}
			columns = append(columns, col...)
			indexes[table] = idx
		}
		t.ss[i] = sqlsmith.New()
		t.ss[i].SetDB(t.schema)
		t.ss[i].LoadSchema(columns, indexes)
	}
	return nil
}