github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/restore/meta_manager.go (about) 1 // Copyright 2021 PingCAP, Inc. Licensed under Apache-2.0. 2 3 package restore 4 5 import ( 6 "context" 7 "database/sql" 8 "encoding/json" 9 "fmt" 10 "strings" 11 12 "github.com/pingcap/errors" 13 "github.com/pingcap/parser/model" 14 "github.com/pingcap/parser/mysql" 15 "go.uber.org/zap" 16 17 "github.com/pingcap/br/pkg/lightning/backend/tidb" 18 "github.com/pingcap/br/pkg/lightning/common" 19 "github.com/pingcap/br/pkg/lightning/log" 20 verify "github.com/pingcap/br/pkg/lightning/verification" 21 "github.com/pingcap/br/pkg/pdutil" 22 "github.com/pingcap/br/pkg/redact" 23 ) 24 25 type metaMgrBuilder interface { 26 Init(ctx context.Context) error 27 TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr 28 TableMetaMgr(tr *TableRestore) tableMetaMgr 29 } 30 31 type dbMetaMgrBuilder struct { 32 db *sql.DB 33 taskID int64 34 schema string 35 needChecksum bool 36 } 37 38 func (b *dbMetaMgrBuilder) Init(ctx context.Context) error { 39 exec := common.SQLWithRetry{ 40 DB: b.db, 41 Logger: log.L(), 42 HideQueryLog: redact.NeedRedact(), 43 } 44 metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS %s", common.EscapeIdentifier(b.schema)) 45 if err := exec.Exec(ctx, "create meta schema", metaDBSQL); err != nil { 46 return errors.Annotate(err, "create meta schema failed") 47 } 48 taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, common.UniqueTable(b.schema, taskMetaTableName)) 49 if err := exec.Exec(ctx, "create meta table", taskMetaSQL); err != nil { 50 return errors.Annotate(err, "create task meta table failed") 51 } 52 tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, common.UniqueTable(b.schema, tableMetaTableName)) 53 if err := exec.Exec(ctx, "create meta table", tableMetaSQL); err != nil { 54 return errors.Annotate(err, "create table meta table failed") 55 } 56 return nil 57 } 58 59 func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { 60 return &dbTaskMetaMgr{ 61 session: b.db, 62 taskID: b.taskID, 63 pd: pd, 64 tableName: common.UniqueTable(b.schema, taskMetaTableName), 65 schemaName: b.schema, 66 } 67 } 68 69 func (b *dbMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { 70 return &dbTableMetaMgr{ 71 session: b.db, 72 taskID: b.taskID, 73 tr: tr, 74 tableName: common.UniqueTable(b.schema, tableMetaTableName), 75 needChecksum: b.needChecksum, 76 } 77 } 78 79 type tableMetaMgr interface { 80 InitTableMeta(ctx context.Context) error 81 AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) 82 UpdateTableStatus(ctx context.Context, status metaStatus) error 83 UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error 84 CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) 85 FinishTable(ctx context.Context) error 86 } 87 88 type dbTableMetaMgr struct { 89 session *sql.DB 90 taskID int64 91 tr *TableRestore 92 tableName string 93 needChecksum bool 94 } 95 96 func (m *dbTableMetaMgr) InitTableMeta(ctx context.Context) error { 97 exec := &common.SQLWithRetry{ 98 DB: m.session, 99 Logger: m.tr.logger, 100 } 101 // avoid override existing metadata if the meta is already inserted. 102 stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, table_id, table_name, status) values (?, ?, ?, ?)`, m.tableName) 103 task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") 104 err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) 105 task.End(zap.ErrorLevel, err) 106 return errors.Trace(err) 107 } 108 109 type metaStatus uint32 110 111 const ( 112 metaStatusInitial metaStatus = iota 113 metaStatusRowIDAllocated 114 metaStatusRestoreStarted 115 metaStatusRestoreFinished 116 metaStatusChecksuming 117 metaStatusChecksumSkipped 118 metaStatusFinished 119 ) 120 121 func (m metaStatus) String() string { 122 switch m { 123 case metaStatusInitial: 124 return "initialized" 125 case metaStatusRowIDAllocated: 126 return "allocated" 127 case metaStatusRestoreStarted: 128 return "restore" 129 case metaStatusRestoreFinished: 130 return "restore_finished" 131 case metaStatusChecksuming: 132 return "checksuming" 133 case metaStatusChecksumSkipped: 134 return "checksum_skipped" 135 case metaStatusFinished: 136 return "finish" 137 default: 138 panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) 139 } 140 } 141 142 func parseMetaStatus(s string) (metaStatus, error) { 143 switch s { 144 case "", "initialized": 145 return metaStatusInitial, nil 146 case "allocated": 147 return metaStatusRowIDAllocated, nil 148 case "restore": 149 return metaStatusRestoreStarted, nil 150 case "restore_finished": 151 return metaStatusRestoreFinished, nil 152 case "checksuming": 153 return metaStatusChecksuming, nil 154 case "checksum_skipped": 155 return metaStatusChecksumSkipped, nil 156 case "finish": 157 return metaStatusFinished, nil 158 default: 159 return metaStatusInitial, errors.Errorf("invalid meta status '%s'", s) 160 } 161 } 162 163 func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { 164 conn, err := m.session.Conn(ctx) 165 if err != nil { 166 return nil, 0, errors.Trace(err) 167 } 168 defer conn.Close() 169 exec := &common.SQLWithRetry{ 170 DB: m.session, 171 Logger: m.tr.logger, 172 } 173 var newRowIDBase, newRowIDMax int64 174 curStatus := metaStatusInitial 175 newStatus := metaStatusRowIDAllocated 176 var baseTotalKvs, baseTotalBytes, baseChecksum uint64 177 err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") 178 if err != nil { 179 return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") 180 } 181 needAutoID := common.TableHasAutoRowID(m.tr.tableInfo.Core) || m.tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || m.tr.tableInfo.Core.ContainsAutoRandomBits() 182 err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { 183 query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? FOR UPDATE", m.tableName) 184 rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) 185 if err != nil { 186 return errors.Trace(err) 187 } 188 defer rows.Close() 189 var ( 190 metaTaskID, rowIDBase, rowIDMax, maxRowIDMax int64 191 totalKvs, totalBytes, checksum uint64 192 statusValue string 193 ) 194 for rows.Next() { 195 if err = rows.Scan(&metaTaskID, &rowIDBase, &rowIDMax, &totalKvs, &totalBytes, &checksum, &statusValue); err != nil { 196 return errors.Trace(err) 197 } 198 status, err := parseMetaStatus(statusValue) 199 if err != nil { 200 return errors.Annotatef(err, "invalid meta status '%s'", statusValue) 201 } 202 203 // skip finished meta 204 if status >= metaStatusFinished { 205 continue 206 } 207 208 if status == metaStatusChecksuming { 209 return errors.New("target table is calculating checksum, please wait unit the checksum is finished and try again.") 210 } 211 212 if metaTaskID == m.taskID { 213 curStatus = status 214 baseChecksum = checksum 215 baseTotalKvs = totalKvs 216 baseTotalBytes = totalBytes 217 if status >= metaStatusRowIDAllocated { 218 if rowIDMax-rowIDBase != rawRowIDMax { 219 return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) 220 } 221 newRowIDBase = rowIDBase 222 newRowIDMax = rowIDMax 223 break 224 } 225 continue 226 } 227 228 // other tasks has finished this logic, we needn't do again. 229 if status >= metaStatusRowIDAllocated { 230 newStatus = metaStatusRestoreStarted 231 } 232 233 if rowIDMax > maxRowIDMax { 234 maxRowIDMax = rowIDMax 235 } 236 } 237 238 // no enough info are available, fetch row_id max for table 239 if curStatus == metaStatusInitial { 240 if needAutoID && maxRowIDMax == 0 { 241 // NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid, 242 // the `show table next_row_id` will returns the unique key field only. 243 var autoIDField string 244 for _, col := range m.tr.tableInfo.Core.Columns { 245 if mysql.HasAutoIncrementFlag(col.Flag) { 246 autoIDField = col.Name.L 247 break 248 } else if mysql.HasPriKeyFlag(col.Flag) && m.tr.tableInfo.Core.AutoRandomBits > 0 { 249 autoIDField = col.Name.L 250 break 251 } 252 } 253 if len(autoIDField) == 0 && common.TableHasAutoRowID(m.tr.tableInfo.Core) { 254 autoIDField = model.ExtraHandleName.L 255 } 256 if len(autoIDField) == 0 { 257 return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", m.tr.tableName) 258 } 259 260 autoIDInfos, err := tidb.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) 261 if err != nil { 262 return errors.Trace(err) 263 } 264 found := false 265 for _, info := range autoIDInfos { 266 if strings.ToLower(info.Column) == autoIDField { 267 maxRowIDMax = info.NextID - 1 268 found = true 269 break 270 } 271 } 272 if !found { 273 return errors.Errorf("can't fetch previous auto id base for table %s field '%s'", m.tr.tableName, autoIDField) 274 } 275 } 276 newRowIDBase = maxRowIDMax 277 newRowIDMax = newRowIDBase + rawRowIDMax 278 // table contains no data, can skip checksum 279 if needAutoID && newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { 280 newStatus = metaStatusRestoreStarted 281 } 282 query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName) 283 _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) 284 if err != nil { 285 return errors.Trace(err) 286 } 287 288 curStatus = newStatus 289 } 290 return nil 291 }) 292 if err != nil { 293 return nil, 0, errors.Trace(err) 294 } 295 296 var checksum *verify.KVChecksum 297 // need to do checksum and update checksum meta since we are the first one. 298 if curStatus < metaStatusRestoreStarted { 299 // table contains data but haven't do checksum yet 300 if (newRowIDBase > 0 || !needAutoID) && m.needChecksum && baseTotalKvs == 0 { 301 remoteCk, err := DoChecksum(ctx, m.tr.tableInfo) 302 if err != nil { 303 return nil, 0, errors.Trace(err) 304 } 305 306 if remoteCk.Checksum != baseChecksum || remoteCk.TotalKVs != baseTotalKvs || remoteCk.TotalBytes != baseTotalBytes { 307 ck := verify.MakeKVChecksum(remoteCk.TotalBytes, remoteCk.TotalKVs, remoteCk.Checksum) 308 checksum = &ck 309 } 310 311 } 312 313 if checksum != nil { 314 if err = m.UpdateTableBaseChecksum(ctx, checksum); err != nil { 315 return nil, 0, errors.Trace(err) 316 } 317 318 m.tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) 319 } else if err = m.UpdateTableStatus(ctx, metaStatusRestoreStarted); err != nil { 320 return nil, 0, errors.Trace(err) 321 } 322 } 323 if checksum == nil && baseTotalKvs > 0 { 324 ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum) 325 checksum = &ck 326 } 327 log.L().Info("allocate table row_id base", zap.String("table", m.tr.tableName), 328 zap.Int64("row_id_base", newRowIDBase)) 329 if checksum != nil { 330 log.L().Info("checksum base", zap.Any("checksum", checksum)) 331 } 332 return checksum, newRowIDBase, nil 333 } 334 335 func (m *dbTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { 336 exec := &common.SQLWithRetry{ 337 DB: m.session, 338 Logger: m.tr.logger, 339 } 340 query := fmt.Sprintf("update %s set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?", m.tableName) 341 342 return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), 343 checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) 344 } 345 346 func (m *dbTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { 347 exec := &common.SQLWithRetry{ 348 DB: m.session, 349 Logger: m.tr.logger, 350 } 351 query := fmt.Sprintf("update %s set status = ? where table_id = ? and task_id = ?", m.tableName) 352 return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) 353 } 354 355 func (m *dbTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { 356 conn, err := m.session.Conn(ctx) 357 if err != nil { 358 return false, nil, errors.Trace(err) 359 } 360 defer conn.Close() 361 exec := &common.SQLWithRetry{ 362 DB: m.session, 363 Logger: m.tr.logger, 364 } 365 err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") 366 if err != nil { 367 return false, nil, errors.Annotate(err, "enable pessimistic transaction failed") 368 } 369 var ( 370 baseTotalKvs, baseTotalBytes, baseChecksum uint64 371 taskKvs, taskBytes, taskChecksum uint64 372 totalKvs, totalBytes, totalChecksum uint64 373 ) 374 newStatus := metaStatusChecksuming 375 needChecksum := true 376 err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { 377 query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from %s WHERE table_id = ? FOR UPDATE", m.tableName) 378 rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) 379 if err != nil { 380 return errors.Annotate(err, "fetch task meta failed") 381 } 382 closed := false 383 defer func() { 384 if !closed { 385 rows.Close() 386 } 387 }() 388 var ( 389 taskID int64 390 statusValue string 391 ) 392 for rows.Next() { 393 if err = rows.Scan(&taskID, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &taskKvs, &taskBytes, &taskChecksum, &statusValue); err != nil { 394 return errors.Trace(err) 395 } 396 status, err := parseMetaStatus(statusValue) 397 if err != nil { 398 return errors.Annotatef(err, "invalid meta status '%s'", statusValue) 399 } 400 401 // skip finished meta 402 if status >= metaStatusFinished { 403 continue 404 } 405 406 if taskID == m.taskID { 407 if status >= metaStatusChecksuming { 408 newStatus = status 409 needChecksum = status == metaStatusChecksuming 410 return nil 411 } 412 413 continue 414 } 415 416 if status < metaStatusChecksuming { 417 newStatus = metaStatusChecksumSkipped 418 needChecksum = false 419 break 420 } else if status == metaStatusChecksuming { 421 return errors.New("another task is checksuming, there must be something wrong!") 422 } 423 424 totalBytes += baseTotalBytes 425 totalKvs += baseTotalKvs 426 totalChecksum ^= baseChecksum 427 428 totalBytes += taskBytes 429 totalKvs += taskKvs 430 totalChecksum ^= taskChecksum 431 } 432 rows.Close() 433 closed = true 434 435 query = fmt.Sprintf("update %s set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?", m.tableName) 436 _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) 437 return errors.Annotate(err, "update local checksum failed") 438 }) 439 if err != nil { 440 return false, nil, err 441 } 442 443 var remoteChecksum *verify.KVChecksum 444 if needChecksum { 445 ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum) 446 remoteChecksum = &ck 447 } 448 log.L().Info("check table checksum", zap.String("table", m.tr.tableName), 449 zap.Bool("checksum", needChecksum), zap.String("new_status", newStatus.String())) 450 return needChecksum, remoteChecksum, nil 451 } 452 453 func (m *dbTableMetaMgr) FinishTable(ctx context.Context) error { 454 exec := &common.SQLWithRetry{ 455 DB: m.session, 456 Logger: m.tr.logger, 457 } 458 query := fmt.Sprintf("DELETE FROM %s where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", m.tableName) 459 return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) 460 } 461 462 type taskMetaMgr interface { 463 InitTask(ctx context.Context, source int64) error 464 CheckClusterSource(ctx context.Context) (int64, error) 465 CheckTaskExist(ctx context.Context) (bool, error) 466 CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) 467 // CheckAndFinishRestore check task meta and return whether to switch cluster to normal state and clean up the metadata 468 // Return values: first boolean indicates whether switch back tidb cluster to normal state (restore schedulers, switch tikv to normal) 469 // the second boolean indicates whether to clean up the metadata in tidb 470 CheckAndFinishRestore(ctx context.Context, finished bool) (shouldSwitchBack bool, shouldCleanupMeta bool, err error) 471 Cleanup(ctx context.Context) error 472 CleanupTask(ctx context.Context) error 473 CleanupAllMetas(ctx context.Context) error 474 Close() 475 } 476 477 type dbTaskMetaMgr struct { 478 session *sql.DB 479 taskID int64 480 pd *pdutil.PdController 481 // unique name of task meta table 482 tableName string 483 schemaName string 484 } 485 486 type taskMetaStatus uint32 487 488 const ( 489 taskMetaStatusInitial taskMetaStatus = iota 490 taskMetaStatusScheduleSet 491 taskMetaStatusSwitchSkipped 492 taskMetaStatusSwitchBack 493 ) 494 495 const ( 496 taskStateNormal int = iota 497 taskStateExited 498 ) 499 500 func (m taskMetaStatus) String() string { 501 switch m { 502 case taskMetaStatusInitial: 503 return "initialized" 504 case taskMetaStatusScheduleSet: 505 return "schedule_set" 506 case taskMetaStatusSwitchSkipped: 507 return "skip_switch" 508 case taskMetaStatusSwitchBack: 509 return "switched" 510 default: 511 panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) 512 } 513 } 514 515 func parseTaskMetaStatus(s string) (taskMetaStatus, error) { 516 switch s { 517 case "", "initialized": 518 return taskMetaStatusInitial, nil 519 case "schedule_set": 520 return taskMetaStatusScheduleSet, nil 521 case "skip_switch": 522 return taskMetaStatusSwitchSkipped, nil 523 case "switched": 524 return taskMetaStatusSwitchBack, nil 525 default: 526 return taskMetaStatusInitial, errors.Errorf("invalid meta status '%s'", s) 527 } 528 } 529 530 type storedCfgs struct { 531 PauseCfg pdutil.ClusterConfig `json:"paused"` 532 RestoreCfg pdutil.ClusterConfig `json:"restore"` 533 } 534 535 func (m *dbTaskMetaMgr) InitTask(ctx context.Context, source int64) error { 536 exec := &common.SQLWithRetry{ 537 DB: m.session, 538 Logger: log.L(), 539 } 540 // avoid override existing metadata if the meta is already inserted. 541 stmt := fmt.Sprintf(`INSERT INTO %s (task_id, status, source_bytes) values (?, ?, ?) ON DUPLICATE KEY UPDATE state = ?`, m.tableName) 542 err := exec.Exec(ctx, "init task meta", stmt, m.taskID, taskMetaStatusInitial.String(), source, taskStateNormal) 543 return errors.Trace(err) 544 } 545 546 func (m *dbTaskMetaMgr) CheckTaskExist(ctx context.Context) (bool, error) { 547 exec := &common.SQLWithRetry{ 548 DB: m.session, 549 Logger: log.L(), 550 } 551 // avoid override existing metadata if the meta is already inserted. 552 exist := false 553 err := exec.Transact(ctx, "check whether this task has started before", func(ctx context.Context, tx *sql.Tx) error { 554 query := fmt.Sprintf("SELECT task_id from %s WHERE task_id = %d", m.tableName, m.taskID) 555 rows, err := tx.QueryContext(ctx, query) 556 if err != nil { 557 return errors.Annotate(err, "fetch task meta failed") 558 } 559 var taskID int64 560 for rows.Next() { 561 if err = rows.Scan(&taskID); err != nil { 562 rows.Close() 563 return errors.Trace(err) 564 } 565 if taskID == m.taskID { 566 exist = true 567 } 568 } 569 err = rows.Close() 570 return errors.Trace(err) 571 }) 572 return exist, errors.Trace(err) 573 } 574 575 func (m *dbTaskMetaMgr) CheckClusterSource(ctx context.Context) (int64, error) { 576 conn, err := m.session.Conn(ctx) 577 if err != nil { 578 return 0, errors.Trace(err) 579 } 580 defer conn.Close() 581 exec := &common.SQLWithRetry{ 582 DB: m.session, 583 Logger: log.L(), 584 } 585 586 source := int64(0) 587 query := fmt.Sprintf("SELECT SUM(source_bytes) from %s", m.tableName) 588 if err := exec.QueryRow(ctx, "query total source size", query, &source); err != nil { 589 return 0, errors.Annotate(err, "fetch task meta failed") 590 } 591 return source, nil 592 } 593 594 func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { 595 pauseCtx, cancel := context.WithCancel(ctx) 596 conn, err := m.session.Conn(ctx) 597 if err != nil { 598 cancel() 599 return nil, errors.Trace(err) 600 } 601 defer conn.Close() 602 exec := &common.SQLWithRetry{ 603 DB: m.session, 604 Logger: log.L(), 605 } 606 err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") 607 if err != nil { 608 cancel() 609 return nil, errors.Annotate(err, "enable pessimistic transaction failed") 610 } 611 612 needSwitch := true 613 paused := false 614 var pausedCfg storedCfgs 615 err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error { 616 query := fmt.Sprintf("SELECT task_id, pd_cfgs, status, state from %s FOR UPDATE", m.tableName) 617 rows, err := tx.QueryContext(ctx, query) 618 if err != nil { 619 return errors.Annotate(err, "fetch task meta failed") 620 } 621 closed := false 622 defer func() { 623 if !closed { 624 rows.Close() 625 } 626 }() 627 var ( 628 taskID int64 629 cfg string 630 statusValue string 631 state int 632 ) 633 var cfgStr string 634 for rows.Next() { 635 if err = rows.Scan(&taskID, &cfg, &statusValue, &state); err != nil { 636 return errors.Trace(err) 637 } 638 status, err := parseTaskMetaStatus(statusValue) 639 if err != nil { 640 return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) 641 } 642 643 if status == taskMetaStatusInitial { 644 continue 645 } 646 647 if taskID == m.taskID { 648 if status >= taskMetaStatusSwitchSkipped { 649 needSwitch = false 650 return nil 651 } 652 } 653 654 if cfg != "" { 655 cfgStr = cfg 656 break 657 } 658 } 659 if err = rows.Close(); err != nil { 660 return errors.Trace(err) 661 } 662 closed = true 663 664 if cfgStr != "" { 665 err = json.Unmarshal([]byte(cfgStr), &pausedCfg) 666 return errors.Trace(err) 667 } 668 669 orig, removed, err := m.pd.RemoveSchedulersWithOrigin(pauseCtx) 670 if err != nil { 671 return errors.Trace(err) 672 } 673 paused = true 674 675 pausedCfg = storedCfgs{PauseCfg: removed, RestoreCfg: orig} 676 jsonByts, err := json.Marshal(&pausedCfg) 677 if err != nil { 678 return errors.Trace(err) 679 } 680 681 query = fmt.Sprintf("update %s set pd_cfgs = ?, status = ? where task_id = ?", m.tableName) 682 _, err = tx.ExecContext(ctx, query, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) 683 684 return errors.Annotate(err, "update task pd configs failed") 685 }) 686 if err != nil { 687 cancel() 688 return nil, err 689 } 690 691 if !needSwitch { 692 cancel() 693 return nil, nil 694 } 695 696 if !paused { 697 if err = m.pd.RemoveSchedulersWithCfg(pauseCtx, pausedCfg.PauseCfg); err != nil { 698 cancel() 699 return nil, err 700 } 701 } 702 703 cancelFunc := m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg) 704 705 return func(ctx context.Context) error { 706 // close the periodic task ctx 707 cancel() 708 return cancelFunc(ctx) 709 }, nil 710 } 711 712 // CheckAndFinishRestore check task meta and return whether to switch cluster to normal state and clean up the metadata 713 // Return values: first boolean indicates whether switch back tidb cluster to normal state (restore schedulers, switch tikv to normal) 714 // the second boolean indicates whether to clean up the metadata in tidb 715 func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context, finished bool) (bool, bool, error) { 716 conn, err := m.session.Conn(ctx) 717 if err != nil { 718 return false, false, errors.Trace(err) 719 } 720 defer conn.Close() 721 exec := &common.SQLWithRetry{ 722 DB: m.session, 723 Logger: log.L(), 724 } 725 err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") 726 if err != nil { 727 return false, false, errors.Annotate(err, "enable pessimistic transaction failed") 728 } 729 730 switchBack := true 731 allFinished := finished 732 err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error { 733 query := fmt.Sprintf("SELECT task_id, status, state from %s FOR UPDATE", m.tableName) 734 rows, err := tx.QueryContext(ctx, query) 735 if err != nil { 736 return errors.Annotate(err, "fetch task meta failed") 737 } 738 closed := false 739 defer func() { 740 if !closed { 741 rows.Close() 742 } 743 }() 744 var ( 745 taskID int64 746 statusValue string 747 state int 748 ) 749 750 taskStatus := taskMetaStatusInitial 751 for rows.Next() { 752 if err = rows.Scan(&taskID, &statusValue, &state); err != nil { 753 return errors.Trace(err) 754 } 755 status, err := parseTaskMetaStatus(statusValue) 756 if err != nil { 757 return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) 758 } 759 760 if taskID == m.taskID { 761 taskStatus = status 762 continue 763 } 764 765 if status < taskMetaStatusSwitchSkipped { 766 allFinished = false 767 // check if other task still running 768 if state == taskStateNormal { 769 log.L().Info("unfinished task found", zap.Int64("task_id", taskID), 770 zap.Stringer("status", status)) 771 switchBack = false 772 } 773 } 774 } 775 if err = rows.Close(); err != nil { 776 return errors.Trace(err) 777 } 778 closed = true 779 780 if taskStatus < taskMetaStatusSwitchSkipped { 781 newStatus := taskMetaStatusSwitchBack 782 newState := taskStateNormal 783 if !finished { 784 newStatus = taskStatus 785 newState = taskStateExited 786 } else if !allFinished { 787 newStatus = taskMetaStatusSwitchSkipped 788 } 789 790 query = fmt.Sprintf("update %s set status = ?, state = ? where task_id = ?", m.tableName) 791 if _, err = tx.ExecContext(ctx, query, newStatus.String(), newState, m.taskID); err != nil { 792 return errors.Trace(err) 793 } 794 } 795 796 return nil 797 }) 798 log.L().Info("check all task finish status", zap.Bool("task_finished", finished), 799 zap.Bool("all_finished", allFinished), zap.Bool("switch_back", switchBack)) 800 801 return switchBack, allFinished, err 802 } 803 804 func (m *dbTaskMetaMgr) Cleanup(ctx context.Context) error { 805 exec := &common.SQLWithRetry{ 806 DB: m.session, 807 Logger: log.L(), 808 } 809 // avoid override existing metadata if the meta is already inserted. 810 stmt := fmt.Sprintf("DROP TABLE %s;", m.tableName) 811 if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { 812 return errors.Trace(err) 813 } 814 return nil 815 } 816 817 func (m *dbTaskMetaMgr) CleanupTask(ctx context.Context) error { 818 exec := &common.SQLWithRetry{ 819 DB: m.session, 820 Logger: log.L(), 821 } 822 stmt := fmt.Sprintf("DELETE FROM %s WHERE task_id = %d;", m.tableName, m.taskID) 823 err := exec.Exec(ctx, "clean up task", stmt) 824 return errors.Trace(err) 825 } 826 827 func (m *dbTaskMetaMgr) Close() { 828 m.pd.Close() 829 } 830 831 func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { 832 exec := &common.SQLWithRetry{ 833 DB: m.session, 834 Logger: log.L(), 835 } 836 837 // check if all tables are finished 838 query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(m.schemaName, tableMetaTableName)) 839 var cnt int 840 if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil { 841 return errors.Trace(err) 842 } 843 if cnt > 0 { 844 log.L().Warn("there are unfinished table in table meta table, cleanup skipped.") 845 return nil 846 } 847 848 // avoid override existing metadata if the meta is already inserted. 849 stmt := fmt.Sprintf("DROP DATABASE %s;", common.EscapeIdentifier(m.schemaName)) 850 if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { 851 return errors.Trace(err) 852 } 853 return nil 854 } 855 856 type noopMetaMgrBuilder struct{} 857 858 func (b noopMetaMgrBuilder) Init(ctx context.Context) error { 859 return nil 860 } 861 862 func (b noopMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { 863 return noopTaskMetaMgr{} 864 } 865 866 func (b noopMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { 867 return noopTableMetaMgr{} 868 } 869 870 type noopTaskMetaMgr struct{} 871 872 func (m noopTaskMetaMgr) InitTask(ctx context.Context, source int64) error { 873 return nil 874 } 875 876 func (m noopTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { 877 return func(ctx context.Context) error { 878 return nil 879 }, nil 880 } 881 882 func (m noopTaskMetaMgr) CheckTaskExist(ctx context.Context) (bool, error) { 883 return false, nil 884 } 885 886 func (m noopTaskMetaMgr) CheckClusterSource(ctx context.Context) (int64, error) { 887 return 0, nil 888 } 889 890 func (m noopTaskMetaMgr) CheckAndFinishRestore(context.Context, bool) (bool, bool, error) { 891 return false, true, nil 892 } 893 894 func (m noopTaskMetaMgr) Cleanup(ctx context.Context) error { 895 return nil 896 } 897 898 func (m noopTaskMetaMgr) CleanupTask(ctx context.Context) error { 899 return nil 900 } 901 902 func (m noopTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { 903 return nil 904 } 905 906 func (m noopTaskMetaMgr) Close() { 907 } 908 909 type noopTableMetaMgr struct{} 910 911 func (m noopTableMetaMgr) InitTableMeta(ctx context.Context) error { 912 return nil 913 } 914 915 func (m noopTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { 916 return nil, 0, nil 917 } 918 919 func (m noopTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { 920 return nil 921 } 922 923 func (m noopTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { 924 return nil 925 } 926 927 func (m noopTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { 928 return false, nil, nil 929 } 930 931 func (m noopTableMetaMgr) FinishTable(ctx context.Context) error { 932 return nil 933 }