github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/restore/meta_manager.go (about)

     1  // Copyright 2021 PingCAP, Inc. Licensed under Apache-2.0.
     2  
     3  package restore
     4  
     5  import (
     6  	"context"
     7  	"database/sql"
     8  	"encoding/json"
     9  	"fmt"
    10  	"strings"
    11  
    12  	"github.com/pingcap/errors"
    13  	"github.com/pingcap/parser/model"
    14  	"github.com/pingcap/parser/mysql"
    15  	"go.uber.org/zap"
    16  
    17  	"github.com/pingcap/br/pkg/lightning/backend/tidb"
    18  	"github.com/pingcap/br/pkg/lightning/common"
    19  	"github.com/pingcap/br/pkg/lightning/log"
    20  	verify "github.com/pingcap/br/pkg/lightning/verification"
    21  	"github.com/pingcap/br/pkg/pdutil"
    22  	"github.com/pingcap/br/pkg/redact"
    23  )
    24  
// metaMgrBuilder creates the task-level and table-level meta managers used during a restore.
type metaMgrBuilder interface {
	// Init prepares whatever backing storage the managers rely on.
	Init(ctx context.Context) error
	// TaskMetaMgr returns a task-level meta manager that uses the given PD controller.
	TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr
	// TableMetaMgr returns a table-level meta manager for the given table restore.
	TableMetaMgr(tr *TableRestore) tableMetaMgr
}
    30  
// dbMetaMgrBuilder is a metaMgrBuilder whose managers keep their metadata in SQL tables
// reached through db.
type dbMetaMgrBuilder struct {
	// db is handed to every manager this builder creates.
	db *sql.DB
	// taskID is copied into every manager this builder creates.
	taskID int64
	// schema is the database that holds the meta tables.
	schema string
	// needChecksum is forwarded to dbTableMetaMgr.needChecksum.
	needChecksum bool
}
    37  
    38  func (b *dbMetaMgrBuilder) Init(ctx context.Context) error {
    39  	exec := common.SQLWithRetry{
    40  		DB:           b.db,
    41  		Logger:       log.L(),
    42  		HideQueryLog: redact.NeedRedact(),
    43  	}
    44  	metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS %s", common.EscapeIdentifier(b.schema))
    45  	if err := exec.Exec(ctx, "create meta schema", metaDBSQL); err != nil {
    46  		return errors.Annotate(err, "create meta schema failed")
    47  	}
    48  	taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, common.UniqueTable(b.schema, taskMetaTableName))
    49  	if err := exec.Exec(ctx, "create meta table", taskMetaSQL); err != nil {
    50  		return errors.Annotate(err, "create task meta table failed")
    51  	}
    52  	tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, common.UniqueTable(b.schema, tableMetaTableName))
    53  	if err := exec.Exec(ctx, "create meta table", tableMetaSQL); err != nil {
    54  		return errors.Annotate(err, "create table meta table failed")
    55  	}
    56  	return nil
    57  }
    58  
    59  func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr {
    60  	return &dbTaskMetaMgr{
    61  		session:    b.db,
    62  		taskID:     b.taskID,
    63  		pd:         pd,
    64  		tableName:  common.UniqueTable(b.schema, taskMetaTableName),
    65  		schemaName: b.schema,
    66  	}
    67  }
    68  
    69  func (b *dbMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr {
    70  	return &dbTableMetaMgr{
    71  		session:      b.db,
    72  		taskID:       b.taskID,
    73  		tr:           tr,
    74  		tableName:    common.UniqueTable(b.schema, tableMetaTableName),
    75  		needChecksum: b.needChecksum,
    76  	}
    77  }
    78  
// tableMetaMgr tracks the per-table progress of one import task. The method notes below
// describe the SQL-backed implementation in this file (dbTableMetaMgr).
type tableMetaMgr interface {
	// InitTableMeta registers this task's meta row for the table without overwriting an existing one.
	InitTableMeta(ctx context.Context) error
	// AllocTableRowIDs reserves a row-ID range of size rawRowIDMax and returns the base
	// checksum (may be nil) together with the row-ID base.
	AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error)
	// UpdateTableStatus stores a new status on this task's meta row.
	UpdateTableStatus(ctx context.Context, status metaStatus) error
	// UpdateTableBaseChecksum stores the checksum of the data that existed before the restore.
	UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error
	// CheckAndUpdateLocalChecksum stores this task's checksum and reports whether the caller
	// should verify the table, returning the checksum accumulated from the other tasks' rows.
	CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error)
	// FinishTable removes the table's meta rows once checksumming is done or skipped.
	FinishTable(ctx context.Context) error
}
    87  
// dbTableMetaMgr is the tableMetaMgr that keeps per-table progress in the SQL table
// named by tableName.
type dbTableMetaMgr struct {
	// session is the database handle every statement is issued through.
	session *sql.DB
	// taskID identifies this task's rows in the meta table.
	taskID int64
	// tr is the table restore whose table ID, table name and logger are used.
	tr *TableRestore
	// tableName is the meta table name as spliced into SQL text (built with common.UniqueTable).
	tableName string
	// needChecksum gates the pre-restore checksum in AllocTableRowIDs.
	needChecksum bool
}
    95  
    96  func (m *dbTableMetaMgr) InitTableMeta(ctx context.Context) error {
    97  	exec := &common.SQLWithRetry{
    98  		DB:     m.session,
    99  		Logger: m.tr.logger,
   100  	}
   101  	// avoid override existing metadata if the meta is already inserted.
   102  	stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, table_id, table_name, status) values (?, ?, ?, ?)`, m.tableName)
   103  	task := m.tr.logger.Begin(zap.DebugLevel, "init table meta")
   104  	err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String())
   105  	task.End(zap.ErrorLevel, err)
   106  	return errors.Trace(err)
   107  }
   108  
   109  type metaStatus uint32
   110  
   111  const (
   112  	metaStatusInitial metaStatus = iota
   113  	metaStatusRowIDAllocated
   114  	metaStatusRestoreStarted
   115  	metaStatusRestoreFinished
   116  	metaStatusChecksuming
   117  	metaStatusChecksumSkipped
   118  	metaStatusFinished
   119  )
   120  
   121  func (m metaStatus) String() string {
   122  	switch m {
   123  	case metaStatusInitial:
   124  		return "initialized"
   125  	case metaStatusRowIDAllocated:
   126  		return "allocated"
   127  	case metaStatusRestoreStarted:
   128  		return "restore"
   129  	case metaStatusRestoreFinished:
   130  		return "restore_finished"
   131  	case metaStatusChecksuming:
   132  		return "checksuming"
   133  	case metaStatusChecksumSkipped:
   134  		return "checksum_skipped"
   135  	case metaStatusFinished:
   136  		return "finish"
   137  	default:
   138  		panic(fmt.Sprintf("unexpected metaStatus value '%d'", m))
   139  	}
   140  }
   141  
   142  func parseMetaStatus(s string) (metaStatus, error) {
   143  	switch s {
   144  	case "", "initialized":
   145  		return metaStatusInitial, nil
   146  	case "allocated":
   147  		return metaStatusRowIDAllocated, nil
   148  	case "restore":
   149  		return metaStatusRestoreStarted, nil
   150  	case "restore_finished":
   151  		return metaStatusRestoreFinished, nil
   152  	case "checksuming":
   153  		return metaStatusChecksuming, nil
   154  	case "checksum_skipped":
   155  		return metaStatusChecksumSkipped, nil
   156  	case "finish":
   157  		return metaStatusFinished, nil
   158  	default:
   159  		return metaStatusInitial, errors.Errorf("invalid meta status '%s'", s)
   160  	}
   161  }
   162  
   163  func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) {
   164  	conn, err := m.session.Conn(ctx)
   165  	if err != nil {
   166  		return nil, 0, errors.Trace(err)
   167  	}
   168  	defer conn.Close()
   169  	exec := &common.SQLWithRetry{
   170  		DB:     m.session,
   171  		Logger: m.tr.logger,
   172  	}
   173  	var newRowIDBase, newRowIDMax int64
   174  	curStatus := metaStatusInitial
   175  	newStatus := metaStatusRowIDAllocated
   176  	var baseTotalKvs, baseTotalBytes, baseChecksum uint64
   177  	err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';")
   178  	if err != nil {
   179  		return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed")
   180  	}
   181  	needAutoID := common.TableHasAutoRowID(m.tr.tableInfo.Core) || m.tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || m.tr.tableInfo.Core.ContainsAutoRandomBits()
   182  	err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error {
   183  		query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? FOR UPDATE", m.tableName)
   184  		rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID)
   185  		if err != nil {
   186  			return errors.Trace(err)
   187  		}
   188  		defer rows.Close()
   189  		var (
   190  			metaTaskID, rowIDBase, rowIDMax, maxRowIDMax int64
   191  			totalKvs, totalBytes, checksum               uint64
   192  			statusValue                                  string
   193  		)
   194  		for rows.Next() {
   195  			if err = rows.Scan(&metaTaskID, &rowIDBase, &rowIDMax, &totalKvs, &totalBytes, &checksum, &statusValue); err != nil {
   196  				return errors.Trace(err)
   197  			}
   198  			status, err := parseMetaStatus(statusValue)
   199  			if err != nil {
   200  				return errors.Annotatef(err, "invalid meta status '%s'", statusValue)
   201  			}
   202  
   203  			// skip finished meta
   204  			if status >= metaStatusFinished {
   205  				continue
   206  			}
   207  
   208  			if status == metaStatusChecksuming {
   209  				return errors.New("target table is calculating checksum, please wait unit the checksum is finished and try again.")
   210  			}
   211  
   212  			if metaTaskID == m.taskID {
   213  				curStatus = status
   214  				baseChecksum = checksum
   215  				baseTotalKvs = totalKvs
   216  				baseTotalBytes = totalBytes
   217  				if status >= metaStatusRowIDAllocated {
   218  					if rowIDMax-rowIDBase != rawRowIDMax {
   219  						return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase)
   220  					}
   221  					newRowIDBase = rowIDBase
   222  					newRowIDMax = rowIDMax
   223  					break
   224  				}
   225  				continue
   226  			}
   227  
   228  			// other tasks has finished this logic, we needn't do again.
   229  			if status >= metaStatusRowIDAllocated {
   230  				newStatus = metaStatusRestoreStarted
   231  			}
   232  
   233  			if rowIDMax > maxRowIDMax {
   234  				maxRowIDMax = rowIDMax
   235  			}
   236  		}
   237  
   238  		// no enough info are available, fetch row_id max for table
   239  		if curStatus == metaStatusInitial {
   240  			if needAutoID && maxRowIDMax == 0 {
   241  				// NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid,
   242  				// the `show table next_row_id` will returns the unique key field only.
   243  				var autoIDField string
   244  				for _, col := range m.tr.tableInfo.Core.Columns {
   245  					if mysql.HasAutoIncrementFlag(col.Flag) {
   246  						autoIDField = col.Name.L
   247  						break
   248  					} else if mysql.HasPriKeyFlag(col.Flag) && m.tr.tableInfo.Core.AutoRandomBits > 0 {
   249  						autoIDField = col.Name.L
   250  						break
   251  					}
   252  				}
   253  				if len(autoIDField) == 0 && common.TableHasAutoRowID(m.tr.tableInfo.Core) {
   254  					autoIDField = model.ExtraHandleName.L
   255  				}
   256  				if len(autoIDField) == 0 {
   257  					return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", m.tr.tableName)
   258  				}
   259  
   260  				autoIDInfos, err := tidb.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName)
   261  				if err != nil {
   262  					return errors.Trace(err)
   263  				}
   264  				found := false
   265  				for _, info := range autoIDInfos {
   266  					if strings.ToLower(info.Column) == autoIDField {
   267  						maxRowIDMax = info.NextID - 1
   268  						found = true
   269  						break
   270  					}
   271  				}
   272  				if !found {
   273  					return errors.Errorf("can't fetch previous auto id base for table %s field '%s'", m.tr.tableName, autoIDField)
   274  				}
   275  			}
   276  			newRowIDBase = maxRowIDMax
   277  			newRowIDMax = newRowIDBase + rawRowIDMax
   278  			// table contains no data, can skip checksum
   279  			if needAutoID && newRowIDBase == 0 && newStatus < metaStatusRestoreStarted {
   280  				newStatus = metaStatusRestoreStarted
   281  			}
   282  			query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName)
   283  			_, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID)
   284  			if err != nil {
   285  				return errors.Trace(err)
   286  			}
   287  
   288  			curStatus = newStatus
   289  		}
   290  		return nil
   291  	})
   292  	if err != nil {
   293  		return nil, 0, errors.Trace(err)
   294  	}
   295  
   296  	var checksum *verify.KVChecksum
   297  	// need to do checksum and update checksum meta since we are the first one.
   298  	if curStatus < metaStatusRestoreStarted {
   299  		// table contains data but haven't do checksum yet
   300  		if (newRowIDBase > 0 || !needAutoID) && m.needChecksum && baseTotalKvs == 0 {
   301  			remoteCk, err := DoChecksum(ctx, m.tr.tableInfo)
   302  			if err != nil {
   303  				return nil, 0, errors.Trace(err)
   304  			}
   305  
   306  			if remoteCk.Checksum != baseChecksum || remoteCk.TotalKVs != baseTotalKvs || remoteCk.TotalBytes != baseTotalBytes {
   307  				ck := verify.MakeKVChecksum(remoteCk.TotalBytes, remoteCk.TotalKVs, remoteCk.Checksum)
   308  				checksum = &ck
   309  			}
   310  
   311  		}
   312  
   313  		if checksum != nil {
   314  			if err = m.UpdateTableBaseChecksum(ctx, checksum); err != nil {
   315  				return nil, 0, errors.Trace(err)
   316  			}
   317  
   318  			m.tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum))
   319  		} else if err = m.UpdateTableStatus(ctx, metaStatusRestoreStarted); err != nil {
   320  			return nil, 0, errors.Trace(err)
   321  		}
   322  	}
   323  	if checksum == nil && baseTotalKvs > 0 {
   324  		ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum)
   325  		checksum = &ck
   326  	}
   327  	log.L().Info("allocate table row_id base", zap.String("table", m.tr.tableName),
   328  		zap.Int64("row_id_base", newRowIDBase))
   329  	if checksum != nil {
   330  		log.L().Info("checksum base", zap.Any("checksum", checksum))
   331  	}
   332  	return checksum, newRowIDBase, nil
   333  }
   334  
   335  func (m *dbTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error {
   336  	exec := &common.SQLWithRetry{
   337  		DB:     m.session,
   338  		Logger: m.tr.logger,
   339  	}
   340  	query := fmt.Sprintf("update %s set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?", m.tableName)
   341  
   342  	return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(),
   343  		checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID)
   344  }
   345  
   346  func (m *dbTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error {
   347  	exec := &common.SQLWithRetry{
   348  		DB:     m.session,
   349  		Logger: m.tr.logger,
   350  	}
   351  	query := fmt.Sprintf("update %s set status = ? where table_id = ? and task_id = ?", m.tableName)
   352  	return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID)
   353  }
   354  
   355  func (m *dbTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) {
   356  	conn, err := m.session.Conn(ctx)
   357  	if err != nil {
   358  		return false, nil, errors.Trace(err)
   359  	}
   360  	defer conn.Close()
   361  	exec := &common.SQLWithRetry{
   362  		DB:     m.session,
   363  		Logger: m.tr.logger,
   364  	}
   365  	err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';")
   366  	if err != nil {
   367  		return false, nil, errors.Annotate(err, "enable pessimistic transaction failed")
   368  	}
   369  	var (
   370  		baseTotalKvs, baseTotalBytes, baseChecksum uint64
   371  		taskKvs, taskBytes, taskChecksum           uint64
   372  		totalKvs, totalBytes, totalChecksum        uint64
   373  	)
   374  	newStatus := metaStatusChecksuming
   375  	needChecksum := true
   376  	err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error {
   377  		query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from %s WHERE table_id = ? FOR UPDATE", m.tableName)
   378  		rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID)
   379  		if err != nil {
   380  			return errors.Annotate(err, "fetch task meta failed")
   381  		}
   382  		closed := false
   383  		defer func() {
   384  			if !closed {
   385  				rows.Close()
   386  			}
   387  		}()
   388  		var (
   389  			taskID      int64
   390  			statusValue string
   391  		)
   392  		for rows.Next() {
   393  			if err = rows.Scan(&taskID, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &taskKvs, &taskBytes, &taskChecksum, &statusValue); err != nil {
   394  				return errors.Trace(err)
   395  			}
   396  			status, err := parseMetaStatus(statusValue)
   397  			if err != nil {
   398  				return errors.Annotatef(err, "invalid meta status '%s'", statusValue)
   399  			}
   400  
   401  			// skip finished meta
   402  			if status >= metaStatusFinished {
   403  				continue
   404  			}
   405  
   406  			if taskID == m.taskID {
   407  				if status >= metaStatusChecksuming {
   408  					newStatus = status
   409  					needChecksum = status == metaStatusChecksuming
   410  					return nil
   411  				}
   412  
   413  				continue
   414  			}
   415  
   416  			if status < metaStatusChecksuming {
   417  				newStatus = metaStatusChecksumSkipped
   418  				needChecksum = false
   419  				break
   420  			} else if status == metaStatusChecksuming {
   421  				return errors.New("another task is checksuming, there must be something wrong!")
   422  			}
   423  
   424  			totalBytes += baseTotalBytes
   425  			totalKvs += baseTotalKvs
   426  			totalChecksum ^= baseChecksum
   427  
   428  			totalBytes += taskBytes
   429  			totalKvs += taskKvs
   430  			totalChecksum ^= taskChecksum
   431  		}
   432  		rows.Close()
   433  		closed = true
   434  
   435  		query = fmt.Sprintf("update %s set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?", m.tableName)
   436  		_, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID)
   437  		return errors.Annotate(err, "update local checksum failed")
   438  	})
   439  	if err != nil {
   440  		return false, nil, err
   441  	}
   442  
   443  	var remoteChecksum *verify.KVChecksum
   444  	if needChecksum {
   445  		ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum)
   446  		remoteChecksum = &ck
   447  	}
   448  	log.L().Info("check table checksum", zap.String("table", m.tr.tableName),
   449  		zap.Bool("checksum", needChecksum), zap.String("new_status", newStatus.String()))
   450  	return needChecksum, remoteChecksum, nil
   451  }
   452  
   453  func (m *dbTableMetaMgr) FinishTable(ctx context.Context) error {
   454  	exec := &common.SQLWithRetry{
   455  		DB:     m.session,
   456  		Logger: m.tr.logger,
   457  	}
   458  	query := fmt.Sprintf("DELETE FROM %s where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", m.tableName)
   459  	return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID)
   460  }
   461  
// taskMetaMgr coordinates several import tasks that share one target cluster. The method
// notes below describe the SQL-backed implementation in this file (dbTaskMetaMgr).
type taskMetaMgr interface {
	// InitTask registers this task together with the size of its source data.
	InitTask(ctx context.Context, source int64) error
	// CheckClusterSource returns the summed source size of all registered tasks.
	CheckClusterSource(ctx context.Context) (int64, error)
	// CheckTaskExist reports whether this task is already registered.
	CheckTaskExist(ctx context.Context) (bool, error)
	// CheckAndPausePdSchedulers pauses PD schedulers, or reuses the pause config stored by
	// another task, and returns the function that undoes it.
	CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error)
	// CheckAndFinishRestore check task meta and return whether to switch cluster to normal state and clean up the metadata
	// Return values: first boolean indicates whether switch back tidb cluster to normal state (restore schedulers, switch tikv to normal)
	// the second boolean indicates whether to clean up the metadata in tidb
	CheckAndFinishRestore(ctx context.Context, finished bool) (shouldSwitchBack bool, shouldCleanupMeta bool, err error)
	// Cleanup drops the task meta table.
	Cleanup(ctx context.Context) error
	// CleanupTask deletes this task's row from the task meta table.
	CleanupTask(ctx context.Context) error
	// CleanupAllMetas drops the whole meta schema unless table meta rows remain.
	CleanupAllMetas(ctx context.Context) error
	// Close releases the resources held by the manager.
	Close()
}
   476  
// dbTaskMetaMgr is the taskMetaMgr that keeps task state in the SQL table named by tableName.
type dbTaskMetaMgr struct {
	// session is the database handle every statement is issued through.
	session *sql.DB
	// taskID identifies this task's row in the task meta table.
	taskID int64
	// pd is the PD controller used to remove and restore schedulers; Close closes it.
	pd *pdutil.PdController
	// unique name of task meta table
	tableName string
	// schemaName is the database holding the meta tables; CleanupAllMetas drops it.
	schemaName string
}
   485  
   486  type taskMetaStatus uint32
   487  
   488  const (
   489  	taskMetaStatusInitial taskMetaStatus = iota
   490  	taskMetaStatusScheduleSet
   491  	taskMetaStatusSwitchSkipped
   492  	taskMetaStatusSwitchBack
   493  )
   494  
   495  const (
   496  	taskStateNormal int = iota
   497  	taskStateExited
   498  )
   499  
   500  func (m taskMetaStatus) String() string {
   501  	switch m {
   502  	case taskMetaStatusInitial:
   503  		return "initialized"
   504  	case taskMetaStatusScheduleSet:
   505  		return "schedule_set"
   506  	case taskMetaStatusSwitchSkipped:
   507  		return "skip_switch"
   508  	case taskMetaStatusSwitchBack:
   509  		return "switched"
   510  	default:
   511  		panic(fmt.Sprintf("unexpected metaStatus value '%d'", m))
   512  	}
   513  }
   514  
   515  func parseTaskMetaStatus(s string) (taskMetaStatus, error) {
   516  	switch s {
   517  	case "", "initialized":
   518  		return taskMetaStatusInitial, nil
   519  	case "schedule_set":
   520  		return taskMetaStatusScheduleSet, nil
   521  	case "skip_switch":
   522  		return taskMetaStatusSwitchSkipped, nil
   523  	case "switched":
   524  		return taskMetaStatusSwitchBack, nil
   525  	default:
   526  		return taskMetaStatusInitial, errors.Errorf("invalid meta status '%s'", s)
   527  	}
   528  }
   529  
// storedCfgs is the JSON document saved in the task meta table's pd_cfgs column by
// CheckAndPausePdSchedulers.
type storedCfgs struct {
	// PauseCfg is the config applied while importing; other tasks re-apply it.
	PauseCfg pdutil.ClusterConfig `json:"paused"`
	// RestoreCfg is the original config used to build the undo function.
	RestoreCfg pdutil.ClusterConfig `json:"restore"`
}
   534  
   535  func (m *dbTaskMetaMgr) InitTask(ctx context.Context, source int64) error {
   536  	exec := &common.SQLWithRetry{
   537  		DB:     m.session,
   538  		Logger: log.L(),
   539  	}
   540  	// avoid override existing metadata if the meta is already inserted.
   541  	stmt := fmt.Sprintf(`INSERT INTO %s (task_id, status, source_bytes) values (?, ?, ?) ON DUPLICATE KEY UPDATE state = ?`, m.tableName)
   542  	err := exec.Exec(ctx, "init task meta", stmt, m.taskID, taskMetaStatusInitial.String(), source, taskStateNormal)
   543  	return errors.Trace(err)
   544  }
   545  
   546  func (m *dbTaskMetaMgr) CheckTaskExist(ctx context.Context) (bool, error) {
   547  	exec := &common.SQLWithRetry{
   548  		DB:     m.session,
   549  		Logger: log.L(),
   550  	}
   551  	// avoid override existing metadata if the meta is already inserted.
   552  	exist := false
   553  	err := exec.Transact(ctx, "check whether this task has started before", func(ctx context.Context, tx *sql.Tx) error {
   554  		query := fmt.Sprintf("SELECT task_id from %s WHERE task_id = %d", m.tableName, m.taskID)
   555  		rows, err := tx.QueryContext(ctx, query)
   556  		if err != nil {
   557  			return errors.Annotate(err, "fetch task meta failed")
   558  		}
   559  		var taskID int64
   560  		for rows.Next() {
   561  			if err = rows.Scan(&taskID); err != nil {
   562  				rows.Close()
   563  				return errors.Trace(err)
   564  			}
   565  			if taskID == m.taskID {
   566  				exist = true
   567  			}
   568  		}
   569  		err = rows.Close()
   570  		return errors.Trace(err)
   571  	})
   572  	return exist, errors.Trace(err)
   573  }
   574  
   575  func (m *dbTaskMetaMgr) CheckClusterSource(ctx context.Context) (int64, error) {
   576  	conn, err := m.session.Conn(ctx)
   577  	if err != nil {
   578  		return 0, errors.Trace(err)
   579  	}
   580  	defer conn.Close()
   581  	exec := &common.SQLWithRetry{
   582  		DB:     m.session,
   583  		Logger: log.L(),
   584  	}
   585  
   586  	source := int64(0)
   587  	query := fmt.Sprintf("SELECT SUM(source_bytes) from %s", m.tableName)
   588  	if err := exec.QueryRow(ctx, "query total source size", query, &source); err != nil {
   589  		return 0, errors.Annotate(err, "fetch task meta failed")
   590  	}
   591  	return source, nil
   592  }
   593  
   594  func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) {
   595  	pauseCtx, cancel := context.WithCancel(ctx)
   596  	conn, err := m.session.Conn(ctx)
   597  	if err != nil {
   598  		cancel()
   599  		return nil, errors.Trace(err)
   600  	}
   601  	defer conn.Close()
   602  	exec := &common.SQLWithRetry{
   603  		DB:     m.session,
   604  		Logger: log.L(),
   605  	}
   606  	err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';")
   607  	if err != nil {
   608  		cancel()
   609  		return nil, errors.Annotate(err, "enable pessimistic transaction failed")
   610  	}
   611  
   612  	needSwitch := true
   613  	paused := false
   614  	var pausedCfg storedCfgs
   615  	err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error {
   616  		query := fmt.Sprintf("SELECT task_id, pd_cfgs, status, state from %s FOR UPDATE", m.tableName)
   617  		rows, err := tx.QueryContext(ctx, query)
   618  		if err != nil {
   619  			return errors.Annotate(err, "fetch task meta failed")
   620  		}
   621  		closed := false
   622  		defer func() {
   623  			if !closed {
   624  				rows.Close()
   625  			}
   626  		}()
   627  		var (
   628  			taskID      int64
   629  			cfg         string
   630  			statusValue string
   631  			state       int
   632  		)
   633  		var cfgStr string
   634  		for rows.Next() {
   635  			if err = rows.Scan(&taskID, &cfg, &statusValue, &state); err != nil {
   636  				return errors.Trace(err)
   637  			}
   638  			status, err := parseTaskMetaStatus(statusValue)
   639  			if err != nil {
   640  				return errors.Annotatef(err, "invalid task meta status '%s'", statusValue)
   641  			}
   642  
   643  			if status == taskMetaStatusInitial {
   644  				continue
   645  			}
   646  
   647  			if taskID == m.taskID {
   648  				if status >= taskMetaStatusSwitchSkipped {
   649  					needSwitch = false
   650  					return nil
   651  				}
   652  			}
   653  
   654  			if cfg != "" {
   655  				cfgStr = cfg
   656  				break
   657  			}
   658  		}
   659  		if err = rows.Close(); err != nil {
   660  			return errors.Trace(err)
   661  		}
   662  		closed = true
   663  
   664  		if cfgStr != "" {
   665  			err = json.Unmarshal([]byte(cfgStr), &pausedCfg)
   666  			return errors.Trace(err)
   667  		}
   668  
   669  		orig, removed, err := m.pd.RemoveSchedulersWithOrigin(pauseCtx)
   670  		if err != nil {
   671  			return errors.Trace(err)
   672  		}
   673  		paused = true
   674  
   675  		pausedCfg = storedCfgs{PauseCfg: removed, RestoreCfg: orig}
   676  		jsonByts, err := json.Marshal(&pausedCfg)
   677  		if err != nil {
   678  			return errors.Trace(err)
   679  		}
   680  
   681  		query = fmt.Sprintf("update %s set pd_cfgs = ?, status = ? where task_id = ?", m.tableName)
   682  		_, err = tx.ExecContext(ctx, query, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID)
   683  
   684  		return errors.Annotate(err, "update task pd configs failed")
   685  	})
   686  	if err != nil {
   687  		cancel()
   688  		return nil, err
   689  	}
   690  
   691  	if !needSwitch {
   692  		cancel()
   693  		return nil, nil
   694  	}
   695  
   696  	if !paused {
   697  		if err = m.pd.RemoveSchedulersWithCfg(pauseCtx, pausedCfg.PauseCfg); err != nil {
   698  			cancel()
   699  			return nil, err
   700  		}
   701  	}
   702  
   703  	cancelFunc := m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg)
   704  
   705  	return func(ctx context.Context) error {
   706  		// close the periodic task ctx
   707  		cancel()
   708  		return cancelFunc(ctx)
   709  	}, nil
   710  }
   711  
   712  // CheckAndFinishRestore check task meta and return whether to switch cluster to normal state and clean up the metadata
   713  // Return values: first boolean indicates whether switch back tidb cluster to normal state (restore schedulers, switch tikv to normal)
   714  // the second boolean indicates whether to clean up the metadata in tidb
   715  func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context, finished bool) (bool, bool, error) {
   716  	conn, err := m.session.Conn(ctx)
   717  	if err != nil {
   718  		return false, false, errors.Trace(err)
   719  	}
   720  	defer conn.Close()
   721  	exec := &common.SQLWithRetry{
   722  		DB:     m.session,
   723  		Logger: log.L(),
   724  	}
   725  	err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';")
   726  	if err != nil {
   727  		return false, false, errors.Annotate(err, "enable pessimistic transaction failed")
   728  	}
   729  
   730  	switchBack := true
   731  	allFinished := finished
   732  	err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error {
   733  		query := fmt.Sprintf("SELECT task_id, status, state from %s FOR UPDATE", m.tableName)
   734  		rows, err := tx.QueryContext(ctx, query)
   735  		if err != nil {
   736  			return errors.Annotate(err, "fetch task meta failed")
   737  		}
   738  		closed := false
   739  		defer func() {
   740  			if !closed {
   741  				rows.Close()
   742  			}
   743  		}()
   744  		var (
   745  			taskID      int64
   746  			statusValue string
   747  			state       int
   748  		)
   749  
   750  		taskStatus := taskMetaStatusInitial
   751  		for rows.Next() {
   752  			if err = rows.Scan(&taskID, &statusValue, &state); err != nil {
   753  				return errors.Trace(err)
   754  			}
   755  			status, err := parseTaskMetaStatus(statusValue)
   756  			if err != nil {
   757  				return errors.Annotatef(err, "invalid task meta status '%s'", statusValue)
   758  			}
   759  
   760  			if taskID == m.taskID {
   761  				taskStatus = status
   762  				continue
   763  			}
   764  
   765  			if status < taskMetaStatusSwitchSkipped {
   766  				allFinished = false
   767  				// check if other task still running
   768  				if state == taskStateNormal {
   769  					log.L().Info("unfinished task found", zap.Int64("task_id", taskID),
   770  						zap.Stringer("status", status))
   771  					switchBack = false
   772  				}
   773  			}
   774  		}
   775  		if err = rows.Close(); err != nil {
   776  			return errors.Trace(err)
   777  		}
   778  		closed = true
   779  
   780  		if taskStatus < taskMetaStatusSwitchSkipped {
   781  			newStatus := taskMetaStatusSwitchBack
   782  			newState := taskStateNormal
   783  			if !finished {
   784  				newStatus = taskStatus
   785  				newState = taskStateExited
   786  			} else if !allFinished {
   787  				newStatus = taskMetaStatusSwitchSkipped
   788  			}
   789  
   790  			query = fmt.Sprintf("update %s set status = ?, state = ? where task_id = ?", m.tableName)
   791  			if _, err = tx.ExecContext(ctx, query, newStatus.String(), newState, m.taskID); err != nil {
   792  				return errors.Trace(err)
   793  			}
   794  		}
   795  
   796  		return nil
   797  	})
   798  	log.L().Info("check all task finish status", zap.Bool("task_finished", finished),
   799  		zap.Bool("all_finished", allFinished), zap.Bool("switch_back", switchBack))
   800  
   801  	return switchBack, allFinished, err
   802  }
   803  
   804  func (m *dbTaskMetaMgr) Cleanup(ctx context.Context) error {
   805  	exec := &common.SQLWithRetry{
   806  		DB:     m.session,
   807  		Logger: log.L(),
   808  	}
   809  	// avoid override existing metadata if the meta is already inserted.
   810  	stmt := fmt.Sprintf("DROP TABLE %s;", m.tableName)
   811  	if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil {
   812  		return errors.Trace(err)
   813  	}
   814  	return nil
   815  }
   816  
   817  func (m *dbTaskMetaMgr) CleanupTask(ctx context.Context) error {
   818  	exec := &common.SQLWithRetry{
   819  		DB:     m.session,
   820  		Logger: log.L(),
   821  	}
   822  	stmt := fmt.Sprintf("DELETE FROM %s WHERE task_id = %d;", m.tableName, m.taskID)
   823  	err := exec.Exec(ctx, "clean up task", stmt)
   824  	return errors.Trace(err)
   825  }
   826  
   827  func (m *dbTaskMetaMgr) Close() {
   828  	m.pd.Close()
   829  }
   830  
   831  func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error {
   832  	exec := &common.SQLWithRetry{
   833  		DB:     m.session,
   834  		Logger: log.L(),
   835  	}
   836  
   837  	// check if all tables are finished
   838  	query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(m.schemaName, tableMetaTableName))
   839  	var cnt int
   840  	if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil {
   841  		return errors.Trace(err)
   842  	}
   843  	if cnt > 0 {
   844  		log.L().Warn("there are unfinished table in table meta table, cleanup skipped.")
   845  		return nil
   846  	}
   847  
   848  	// avoid override existing metadata if the meta is already inserted.
   849  	stmt := fmt.Sprintf("DROP DATABASE %s;", common.EscapeIdentifier(m.schemaName))
   850  	if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil {
   851  		return errors.Trace(err)
   852  	}
   853  	return nil
   854  }
   855  
   856  type noopMetaMgrBuilder struct{}
   857  
   858  func (b noopMetaMgrBuilder) Init(ctx context.Context) error {
   859  	return nil
   860  }
   861  
   862  func (b noopMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr {
   863  	return noopTaskMetaMgr{}
   864  }
   865  
   866  func (b noopMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr {
   867  	return noopTableMetaMgr{}
   868  }
   869  
   870  type noopTaskMetaMgr struct{}
   871  
   872  func (m noopTaskMetaMgr) InitTask(ctx context.Context, source int64) error {
   873  	return nil
   874  }
   875  
   876  func (m noopTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) {
   877  	return func(ctx context.Context) error {
   878  		return nil
   879  	}, nil
   880  }
   881  
   882  func (m noopTaskMetaMgr) CheckTaskExist(ctx context.Context) (bool, error) {
   883  	return false, nil
   884  }
   885  
   886  func (m noopTaskMetaMgr) CheckClusterSource(ctx context.Context) (int64, error) {
   887  	return 0, nil
   888  }
   889  
   890  func (m noopTaskMetaMgr) CheckAndFinishRestore(context.Context, bool) (bool, bool, error) {
   891  	return false, true, nil
   892  }
   893  
   894  func (m noopTaskMetaMgr) Cleanup(ctx context.Context) error {
   895  	return nil
   896  }
   897  
   898  func (m noopTaskMetaMgr) CleanupTask(ctx context.Context) error {
   899  	return nil
   900  }
   901  
   902  func (m noopTaskMetaMgr) CleanupAllMetas(ctx context.Context) error {
   903  	return nil
   904  }
   905  
   906  func (m noopTaskMetaMgr) Close() {
   907  }
   908  
   909  type noopTableMetaMgr struct{}
   910  
   911  func (m noopTableMetaMgr) InitTableMeta(ctx context.Context) error {
   912  	return nil
   913  }
   914  
   915  func (m noopTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) {
   916  	return nil, 0, nil
   917  }
   918  
   919  func (m noopTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error {
   920  	return nil
   921  }
   922  
   923  func (m noopTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error {
   924  	return nil
   925  }
   926  
   927  func (m noopTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) {
   928  	return false, nil, nil
   929  }
   930  
   931  func (m noopTableMetaMgr) FinishTable(ctx context.Context) error {
   932  	return nil
   933  }