github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/chaos/cases/task.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"database/sql"
	"fmt"
	"math/rand"
	"strings"
	"time"

	"github.com/chaos-mesh/go-sqlsmith"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/util/dbutil"
	config2 "github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/config/dbconfig"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/conn"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
)

const (
	tableCount      = 10               // number of tables in the schema.
	fullInsertCount = 100              // `INSERT INTO` statement count (not row count) for each table in the full stage.
	diffCount       = 20               // number of data checks in one diff loop.
	diffInterval    = 20 * time.Second // interval between two data checks.
	incrRoundTime   = 10 * time.Second // time to generate incremental data in one round.
)

// task is a data migration task test case with one or more sources.
type task struct {
	logger log.Logger
	ctx    context.Context

	cli pb.MasterClient
	ss  []*sqlsmith.SQLSmith

	sourceDBs   []*conn.BaseDB
	sourceConns []*dbConn
	targetDB    *conn.BaseDB
	targetConn  *dbConn

	schema  string
	tables  []string
	taskCfg config2.TaskConfig
	results results

	caseGenerator *CaseGenerator
}

// newTask creates a new task instance.
func newTask(ctx context.Context, cli pb.MasterClient, taskFile string, schema string,
	targetCfg dbconfig.DBConfig, sourcesCfg ...dbconfig.DBConfig,
) (*task, error) {
	var taskCfg config2.TaskConfig
	err := taskCfg.DecodeFile(taskFile)
	if err != nil {
		return nil, err
	}
	taskCfg.TargetDB = &targetCfg // replace the target DB config.

	var (
		sourceDBs   = make([]*conn.BaseDB, 0, len(taskCfg.MySQLInstances))
		sourceConns = make([]*dbConn, 0, len(taskCfg.MySQLInstances))
		res         = make(results, 0, len(taskCfg.MySQLInstances))
	)
	for i, m := range taskCfg.MySQLInstances { // only use the necessary part of the sources.
		// reset Syncer, otherwise it will report ERROR 20017.
		if len(m.SyncerConfigName) > 0 && m.Syncer != nil {
			m.Syncer = nil
		}

		cfg := sourcesCfg[i]
		db, err2 := conn.GetUpstreamDB(&cfg)
		if err2 != nil {
			return nil, err2
		}
		dbConnection, err2 := createDBConn(ctx, db, schema)
		if err2 != nil {
			return nil, err2
		}
		if taskCfg.CaseSensitive {
			lcSetting, err2 := conn.FetchLowerCaseTableNamesSetting(ctx, dbConnection.baseConn)
			if err2 != nil {
				return nil, err2
			}
			if lcSetting == conn.LCTableNamesMixed {
				msg := "can not set `case-sensitive = true` when upstream `lower_case_table_names = 2`"
				log.L().Error(msg, zap.Any("instance", cfg))
				return nil, errors.New(msg)
			}
		}
		sourceDBs = append(sourceDBs, db)
		sourceConns = append(sourceConns, dbConnection)
		res = append(res, singleResult{})
	}

	targetDB, err := conn.GetDownstreamDB(&targetCfg)
	if err != nil {
		return nil, err
	}
	targetConn, err := createDBConn(ctx, targetDB, schema)
	if err != nil {
		return nil, err
	}

	t := &task{
		logger:        log.L().WithFields(zap.String("case", taskCfg.Name)),
		ctx:           ctx,
		cli:           cli,
		ss:            make([]*sqlsmith.SQLSmith, len(taskCfg.MySQLInstances)),
		sourceDBs:     sourceDBs,
		sourceConns:   sourceConns,
		targetDB:      targetDB,
		targetConn:    targetConn,
		schema:        schema,
		tables:        make([]string, 0),
		taskCfg:       taskCfg,
		results:       res,
		caseGenerator: NewCaseGenerator(taskCfg.ShardMode),
	}
	for i := 0; i < len(t.ss); i++ {
		t.ss[i] = sqlsmith.New()
		t.ss[i].SetDB(schema)
	}
	return t, nil
}
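// Typical usage, as a sketch: `cli`, `targetCfg` and `sourceCfg` are assumed to
// be prepared by the caller, and the task file and schema names below are
// placeholders.
//
//	t, err := newTask(ctx, cli, "task.yaml", "chaos_db", targetCfg, sourceCfg)
//	if err != nil {
//		return err
//	}
//	return t.run() // runs until the context is done or a check fails
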
// run runs the case.
func (t *task) run() error {
	defer func() {
		for _, db := range t.sourceDBs {
			db.Close()
		}
		t.targetDB.Close()

		t.logger.Info("task runs results", zap.Stringer("results", t.results))
	}()

	if err := t.stopPreviousTask(); err != nil {
		return err
	}
	if err := t.clearPreviousData(); err != nil {
		return err
	}

	if err := t.genFullData(); err != nil {
		return err
	}

	if err := t.createTask(); err != nil {
		return err
	}

	t.logger.Info("check data for full stage")
	sourceDBs := make([]*sql.DB, 0, len(t.sourceDBs))
	for _, db := range t.sourceDBs {
		sourceDBs = append(sourceDBs, db.DB)
	}
	if err := diffDataLoop(t.ctx, diffCount, diffInterval, t.schema, t.tables, t.targetDB.DB, sourceDBs...); err != nil {
		return err
	}

	return t.incrLoop()
}

// stopPreviousTask stops the previous task with the same name if it exists.
func (t *task) stopPreviousTask() error {
	t.logger.Info("stopping previous task")
	resp, err := t.cli.OperateTask(t.ctx, &pb.OperateTaskRequest{
		Op:   pb.TaskOp_Delete,
		Name: t.taskCfg.Name,
	})
	if err != nil {
		return err
	} else if !resp.Result && !strings.Contains(resp.Msg, "not exist") {
		return fmt.Errorf("fail to stop task: %s", resp.Msg)
	}
	return nil
}

// clearPreviousData clears previous data in the upstream sources and the downstream target.
func (t *task) clearPreviousData() error {
	t.logger.Info("clearing previous source and target data")
	for _, conn := range t.sourceConns {
		if err := dropDatabase(t.ctx, conn, t.schema); err != nil {
			return err
		}
	}
	return dropDatabase(t.ctx, t.targetConn, t.schema)
}
// genFullData generates data for the full stage.
func (t *task) genFullData() error {
	t.logger.Info("generating data for full stage")
	for _, conn := range t.sourceConns {
		if err := createDatabase(t.ctx, conn, t.schema); err != nil {
			return err
		}
		// NOTE: we set the CURRENT database here.
		if err := conn.execSQLs(t.ctx, fmt.Sprintf("USE %s", t.schema)); err != nil {
			return err
		}
	}

	var (
		columns = make([][5]string, 0)
		indexes = make(map[string][]string)
	)

	// generate `CREATE TABLE` statements.
	for i := 0; i < tableCount; i++ {
		query, name, err := t.ss[0].CreateTableStmt()
		if err != nil {
			return err
		}
		t.logger.Info("creating table", zap.String("query", query))
		for j, conn := range t.sourceConns {
			if err = conn.execSQLs(t.ctx, query); err != nil {
				return err
			}
			// set a different `AUTO_INCREMENT` base per source to avoid duplicate-entry errors for `INSERT`.
			if err = conn.execSQLs(t.ctx, fmt.Sprintf("ALTER TABLE %s AUTO_INCREMENT = %d", name, 1+j*100000000)); err != nil {
				return err
			}
		}
		t.tables = append(t.tables, name)

		col2, idx2, err := createTableToSmithSchema(t.schema, query)
		if err != nil {
			return err
		}
		columns = append(columns, col2...)
		indexes[name] = idx2
	}

	for i := 0; i < len(t.ss); i++ {
		// go-sqlsmith needs to load the schema before generating DML and `ALTER TABLE` statements.
		t.ss[i].LoadSchema(columns, indexes)
	}

	var eg errgroup.Group
	for _, conn := range t.sourceConns {
		conn2 := conn
		eg.Go(func() error {
			for i := 0; i < fullInsertCount; i++ {
				query, _, err2 := t.ss[0].InsertStmt(false)
				if err2 != nil {
					return err2
				}
				if err2 = conn2.execSQLs(t.ctx, query); err2 != nil {
					return err2
				}
			}
			return nil
		})
	}
	return eg.Wait()
}
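// The go-sqlsmith lifecycle used in this file, condensed as a sketch (all of
// these calls appear in genFullData above or in updateSchema below; randDML
// and randDDL are helpers defined elsewhere in this package):
//
//	ss := sqlsmith.New()
//	ss.SetDB(schema)                // database the generated statements target
//	query, table, err := ss.CreateTableStmt()
//	ss.LoadSchema(columns, indexes) // required before generating DML and DDL
//	insert, _, err := ss.InsertStmt(false)
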
// createTask does the `start-task` operation.
func (t *task) createTask() error {
	t.logger.Info("starting the task", zap.String("task cfg", t.taskCfg.String()))
	resp, err := t.cli.StartTask(t.ctx, &pb.StartTaskRequest{
		Task: t.taskCfg.String(),
	})
	if err != nil {
		return err
	} else if !resp.Result && !strings.Contains(resp.Msg, "already exist") { // imprecise match
		return fmt.Errorf("fail to start task: %s", resp.Msg)
	}
	return nil
}

// incrLoop enters the loop of generating incremental data and diffing it.
func (t *task) incrLoop() error {
	t.caseGenerator.Start(t.ctx, t.schema, t.tables)

	// execute preSQLs in the upstream sources.
	for _, sql := range t.caseGenerator.GetPreSQLs() {
		if err := t.sourceConns[sql.source].execDDLs(t.ctx, sql.statement); err != nil {
			return err
		}
	}
	if err := t.updateSchema(); err != nil {
		return err
	}

	for {
		select {
		case <-t.ctx.Done():
			return nil
		default:
			ctx2, cancel2 := context.WithTimeout(t.ctx, incrRoundTime)
			// generate data.
			err := t.genIncrData(ctx2)
			if err != nil {
				cancel2()
				return err
			}

			// diff data.
			err = t.diffIncrData(t.ctx)
			if err != nil {
				cancel2()
				return err
			}
			cancel2()
		}
	}
}

// genIncrData generates data for the incremental stage in one round.
// NOTE: it returns nil when the context is done.
func (t *task) genIncrData(pCtx context.Context) (err error) {
	t.logger.Info("generating data for incremental stage")
	getNewCase := true

	defer func() {
		if errors.Cause(err) == context.Canceled || errors.Cause(err) == context.DeadlineExceeded {
			log.L().Info("context done.", log.ShortError(err))
			err = nil // clear the error for context done.
		} else if err != nil {
			select {
			case <-pCtx.Done():
				t.logger.Warn("ignore error when generating data for incremental stage", zap.Error(err))
				err = nil // some other errors like `connection is already closed` may also be reported for context done.
			default:
				if forceIgnoreExecSQLError(err) {
					t.logger.Warn("ignore error when generating data for incremental stage", zap.Error(err))
					// we don't know which connection was bad, so simply reset all of them for the next round.
					for _, conn := range t.sourceConns {
						if err2 := conn.resetConn(t.ctx); err2 != nil {
							t.logger.Warn("fail to reset connection", zap.Error(err2))
						}
					}
					err = nil
				}
			}
		}
	}()

	runCaseSQLs := func() error {
		testSQLs := t.caseGenerator.GetSQLs()
		if testSQLs == nil {
			getNewCase = false
			return nil
		}
		for _, testSQL := range testSQLs {
			log.L().Info("execute test case sql", zap.String("ddl", testSQL.statement), zap.Int("source", testSQL.source))
			if err2 := t.sourceConns[testSQL.source].execDDLs(t.ctx, testSQL.statement); err2 != nil {
				return err2
			}
		}
		return nil
	}

	defer func() {
		log.L().Info("complete test case sql")
		for {
			if !getNewCase {
				return
			}

			if err2 := runCaseSQLs(); err2 != nil {
				err = err2
				return
			}
			if err2 := t.updateSchema(); err2 != nil {
				err = err2
				return
			}
		}
	}()

	for {
		select {
		case <-pCtx.Done():
			return nil
		default:
		}

		// for DML, we randomly choose an upstream source to execute the statement.
		idx := rand.Intn(len(t.sourceConns))
		query, typ, err := randDML(t.ss[idx])
		if err != nil {
			return err
		}
		if err = t.sourceConns[idx].execDDLs(t.ctx, query); err != nil {
			return err
		}

		switch typ {
		case insertDML:
			t.results[idx].Insert++
		case updateDML:
			t.results[idx].Update++
		case deleteDML:
			t.results[idx].Delete++
		default:
		}

		schemaChanged := false
		if rand.Intn(3000) < 10 { // ~1/300 chance per DML statement to also execute a DDL.
			query, err = randDDL(t.ss[0])
			if err != nil {
				return err
			}

			// skip DDLs unsupported in optimistic mode, e.g. `ALTER TABLE table_name ADD COLUMN column_name INT NOT NULL`.
			if t.taskCfg.ShardMode == config2.ShardOptimistic {
				if yes, err2 := isNotNullNonDefaultAddCol(query); err2 != nil {
					return err2
				} else if yes {
					continue
				}
			}

			t.logger.Info("executing DDL", zap.String("query", query))
			// for DDL, we execute the statement on all upstream sources.
			// NOTE: no re-order injection yet, even for optimistic shard DDL.

			var eg errgroup.Group
			for i, c := range t.sourceConns {
				conn2 := c
				i2 := i
				eg.Go(func() error {
					if err2 := conn2.execDDLs(t.ctx, query); err2 != nil {
						if conn.IsMySQLError(err2, mysql.ErrDupFieldName) {
							t.logger.Warn("ignore duplicate field name for ddl", log.ShortError(err2))
							return nil
						}
						return err2
					}
					t.results[i2].DDL++
					return nil
				})
			}
			if err = eg.Wait(); err != nil {
				return err
			}

			schemaChanged = true
		}

		if getNewCase && rand.Intn(100) < 10 {
			// execute the SQLs of test cases.
			if err = runCaseSQLs(); err != nil {
				return err
			}

			schemaChanged = true
		}

		if schemaChanged {
			if err = t.updateSchema(); err != nil {
				return err
			}
		}
	}
}
// diffIncrData checks data equality for the incremental stage in one round.
// NOTE: it returns nil when the context is done.
func (t *task) diffIncrData(ctx context.Context) (err error) {
	t.logger.Info("check data for incremental stage")

	defer func() {
		if errors.Cause(err) == context.Canceled || errors.Cause(err) == context.DeadlineExceeded {
			err = nil // clear the error for context done.
		} else if err != nil {
			select {
			case <-ctx.Done():
				t.logger.Warn("ignore error when checking data for incremental stage", zap.Error(err))
				err = nil // some other errors like `connection is already closed` may also be reported for context done.
			default:
			}
		}
	}()

	sourceDBs := make([]*sql.DB, 0, len(t.sourceDBs))
	for _, db := range t.sourceDBs {
		sourceDBs = append(sourceDBs, db.DB)
	}
	return diffDataLoop(ctx, diffCount, diffInterval, t.schema, t.tables, t.targetDB.DB, sourceDBs...)
}

// updateSchema reloads the current upstream schema into the go-sqlsmith
// instances, so that later generated statements match the tables after DDL.
func (t *task) updateSchema() error {
	ctx, cancel := context.WithTimeout(context.Background(), conn.DefaultDBTimeout)
	defer cancel()

	for i, db := range t.sourceDBs {
		columns := make([][5]string, 0)
		indexes := make(map[string][]string)
		for _, table := range t.tables {
			createTable, err := dbutil.GetCreateTableSQL(ctx, db.DB, t.schema, table)
			if err != nil {
				return err
			}
			col, idx, err := createTableToSmithSchema(t.schema, createTable)
			if err != nil {
				return err
			}
			columns = append(columns, col...)
			indexes[table] = idx
		}
		t.ss[i] = sqlsmith.New()
		t.ss[i].SetDB(t.schema)
		t.ss[i].LoadSchema(columns, indexes)
	}
	return nil
}
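// For orientation, the overall flow of one case, summarized from the functions
// in this file:
//
//	newTask -> run:
//	    stopPreviousTask -> clearPreviousData -> genFullData -> createTask
//	    -> diffDataLoop (full stage)
//	    -> incrLoop: repeat { genIncrData (one incrRoundTime round) -> diffIncrData }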