github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/restore/checksum.go (about)

     1  package restore
     2  
     3  import (
     4  	"container/heap"
     5  	"context"
     6  	"database/sql"
     7  	"fmt"
     8  	"sync"
     9  	"sync/atomic"
    10  	"time"
    11  
    12  	"github.com/google/uuid"
    13  	"github.com/pingcap/br/pkg/checksum"
    14  	"github.com/pingcap/br/pkg/utils"
    15  	"github.com/pingcap/errors"
    16  	"github.com/pingcap/failpoint"
    17  	tidbcfg "github.com/pingcap/tidb/config"
    18  	"github.com/pingcap/tidb/kv"
    19  	"github.com/pingcap/tidb/store/tikv"
    20  	"github.com/pingcap/tidb/store/tikv/oracle"
    21  	"github.com/pingcap/tipb/go-tipb"
    22  	pd "github.com/tikv/pd/client"
    23  	"go.uber.org/zap"
    24  
    25  	. "github.com/pingcap/tidb-lightning/lightning/checkpoints"
    26  	"github.com/pingcap/tidb-lightning/lightning/common"
    27  	"github.com/pingcap/tidb-lightning/lightning/config"
    28  	"github.com/pingcap/tidb-lightning/lightning/log"
    29  	"github.com/pingcap/tidb-lightning/lightning/metric"
    30  )
    31  
const (
	// preUpdateServiceSafePointFactor controls how often the service safe
	// point is refreshed relative to its TTL: refresh period = TTL / factor.
	preUpdateServiceSafePointFactor = 3

	// maxErrorRetryCount is how many attempts a TiKV checksum request gets
	// before the last error is returned to the caller.
	maxErrorRetryCount = 3
)

var (
	serviceSafePointTTL int64 = 10 * 60 // 10 min in seconds

	// minDistSQLScanConcurrency is the lower bound when the checksum retry
	// loop halves the DistSQL scan concurrency after a failure.
	minDistSQLScanConcurrency = 4
)
    43  
// RemoteChecksum represents a checksum result got from tidb.
type RemoteChecksum struct {
	// Schema and Table identify the checksummed table (the Db_name and
	// Table_name columns of `ADMIN CHECKSUM TABLE`).
	Schema     string
	Table      string
	Checksum   uint64
	TotalKVs   uint64
	TotalBytes uint64
}
    52  
// ChecksumManager computes the remote checksum of a restored table, either
// through a TiDB SQL connection (`ADMIN CHECKSUM TABLE`) or by sending
// checksum requests to TiKV directly.
type ChecksumManager interface {
	Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error)
}
    56  
    57  func newChecksumManager(ctx context.Context, rc *RestoreController) (ChecksumManager, error) {
    58  	// if we don't need checksum, just return nil
    59  	if rc.cfg.TikvImporter.Backend == config.BackendTiDB || rc.cfg.PostRestore.Checksum == config.OpLevelOff {
    60  		return nil, nil
    61  	}
    62  
    63  	pdAddr := rc.cfg.TiDB.PdAddr
    64  	pdVersion, err := common.FetchPDVersion(ctx, rc.tls, pdAddr)
    65  	if err != nil {
    66  		return nil, errors.Trace(err)
    67  	}
    68  
    69  	// for v4.0.0 or upper, we can use the gc ttl api
    70  	var manager ChecksumManager
    71  	if pdVersion.Major >= 4 {
    72  		tlsOpt := rc.tls.ToPDSecurityOption()
    73  		pdCli, err := pd.NewClientWithContext(ctx, []string{pdAddr}, tlsOpt)
    74  		if err != nil {
    75  			return nil, errors.Trace(err)
    76  		}
    77  
    78  		// TODO: make tikv.Driver{}.Open use arguments instead of global variables
    79  		if tlsOpt.CAPath != "" {
    80  			conf := tidbcfg.GetGlobalConfig()
    81  			conf.Security.ClusterSSLCA = tlsOpt.CAPath
    82  			conf.Security.ClusterSSLCert = tlsOpt.CertPath
    83  			conf.Security.ClusterSSLKey = tlsOpt.KeyPath
    84  			tidbcfg.StoreGlobalConfig(conf)
    85  		}
    86  		store, err := tikv.Driver{}.Open(fmt.Sprintf("tikv://%s?disableGC=true", pdAddr))
    87  		if err != nil {
    88  			return nil, errors.Trace(err)
    89  		}
    90  
    91  		manager = newTiKVChecksumManager(store.(tikv.Storage).GetClient(), pdCli, uint(rc.cfg.TiDB.DistSQLScanConcurrency))
    92  	} else {
    93  		db, err := rc.tidbGlue.GetDB()
    94  		if err != nil {
    95  			return nil, errors.Trace(err)
    96  		}
    97  		manager = newTiDBChecksumExecutor(db)
    98  	}
    99  
   100  	return manager, nil
   101  }
   102  
   103  // fetch checksum for tidb sql client
// fetch checksum for tidb sql client
type tidbChecksumExecutor struct {
	db      *sql.DB
	manager *gcLifeTimeManager // extends GC life time while checksum statements run
}
   108  
   109  func newTiDBChecksumExecutor(db *sql.DB) *tidbChecksumExecutor {
   110  	return &tidbChecksumExecutor{
   111  		db:      db,
   112  		manager: newGCLifeTimeManager(),
   113  	}
   114  }
   115  
// Checksum implements ChecksumManager by running `ADMIN CHECKSUM TABLE`
// over SQL. The cluster GC life time is temporarily increased for the
// duration of the statement so the checksum snapshot is not garbage
// collected mid-query.
func (e *tidbChecksumExecutor) Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	var err error
	if err = e.manager.addOneJob(ctx, e.db); err != nil {
		return nil, err
	}

	// set it back finally
	defer e.manager.removeOneJob(ctx, e.db)

	tableName := common.UniqueTable(tableInfo.DB, tableInfo.Name)

	task := log.With(zap.String("table", tableName)).Begin(zap.InfoLevel, "remote checksum")

	// ADMIN CHECKSUM TABLE <table>,<table>  example.
	// 	mysql> admin checksum table test.t;
	// +---------+------------+---------------------+-----------+-------------+
	// | Db_name | Table_name | Checksum_crc64_xor  | Total_kvs | Total_bytes |
	// +---------+------------+---------------------+-----------+-------------+
	// | test    | t          | 8520875019404689597 |   7296873 |   357601387 |
	// +---------+------------+---------------------+-----------+-------------+

	cs := RemoteChecksum{}
	err = common.SQLWithRetry{DB: e.db, Logger: task.Logger}.QueryRow(ctx, "compute remote checksum",
		"ADMIN CHECKSUM TABLE "+tableName, &cs.Schema, &cs.Table, &cs.Checksum, &cs.TotalKVs, &cs.TotalBytes,
	)
	// Duration is observed even on failure so the histogram reflects all attempts.
	dur := task.End(zap.ErrorLevel, err)
	metric.ChecksumSecondsHistogram.Observe(dur.Seconds())
	if err != nil {
		return nil, errors.Trace(err)
	}
	return &cs, nil
}
   148  
   149  // DoChecksum do checksum for tables.
   150  // table should be in <db>.<table>, format.  e.g. foo.bar
   151  func DoChecksum(ctx context.Context, table *TidbTableInfo) (*RemoteChecksum, error) {
   152  	var err error
   153  	manager, ok := ctx.Value(&checksumManagerKey).(ChecksumManager)
   154  	if !ok {
   155  		return nil, errors.New("No gcLifeTimeManager found in context, check context initialization")
   156  	}
   157  
   158  	task := log.With(zap.String("table", table.Name)).Begin(zap.InfoLevel, "remote checksum")
   159  
   160  	cs, err := manager.Checksum(ctx, table)
   161  	dur := task.End(zap.ErrorLevel, err)
   162  	metric.ChecksumSecondsHistogram.Observe(dur.Seconds())
   163  
   164  	return cs, err
   165  }
   166  
// gcLifeTimeManager increases the TiDB GC life time while at least one
// checksum job is running and restores the original value once the last
// job finishes.
type gcLifeTimeManager struct {
	runningJobsLock sync.Mutex
	runningJobs     int    // number of in-flight checksum jobs
	oriGCLifeTime   string // GC life time observed before the first job started
}
   172  
   173  func newGCLifeTimeManager() *gcLifeTimeManager {
   174  	// Default values of three member are enough to initialize this struct
   175  	return &gcLifeTimeManager{}
   176  }
   177  
// Pre- and post-condition:
// if m.runningJobs == 0, GC life time has not been increased.
// if m.runningJobs > 0, GC life time has been increased.
// m.runningJobs won't be negative(overflow) since index concurrency is relatively small
func (m *gcLifeTimeManager) addOneJob(ctx context.Context, db *sql.DB) error {
	m.runningJobsLock.Lock()
	defer m.runningJobsLock.Unlock()

	// The first job snapshots the current GC life time and then increases
	// it; subsequent jobs only bump the counter.
	if m.runningJobs == 0 {
		oriGCLifeTime, err := ObtainGCLifeTime(ctx, db)
		if err != nil {
			return err
		}
		m.oriGCLifeTime = oriGCLifeTime
		err = increaseGCLifeTime(ctx, m, db)
		if err != nil {
			return err
		}
	}
	m.runningJobs += 1
	return nil
}
   200  
   201  // Pre- and post-condition:
   202  // if m.runningJobs == 0, GC life time has been tried to recovered. If this try fails, a warning will be printed.
   203  // if m.runningJobs > 0, GC life time has not been recovered.
   204  // m.runningJobs won't minus to negative since removeOneJob follows a successful addOneJob.
   205  func (m *gcLifeTimeManager) removeOneJob(ctx context.Context, db *sql.DB) {
   206  	m.runningJobsLock.Lock()
   207  	defer m.runningJobsLock.Unlock()
   208  
   209  	m.runningJobs -= 1
   210  	if m.runningJobs == 0 {
   211  		err := UpdateGCLifeTime(ctx, db, m.oriGCLifeTime)
   212  		if err != nil {
   213  			query := fmt.Sprintf(
   214  				"UPDATE mysql.tidb SET VARIABLE_VALUE = '%s' WHERE VARIABLE_NAME = 'tikv_gc_life_time'",
   215  				m.oriGCLifeTime,
   216  			)
   217  			log.L().Warn("revert GC lifetime failed, please reset the GC lifetime manually after Lightning completed",
   218  				zap.String("query", query),
   219  				log.ShortError(err),
   220  			)
   221  		}
   222  	}
   223  }
   224  
   225  func increaseGCLifeTime(ctx context.Context, manager *gcLifeTimeManager, db *sql.DB) (err error) {
   226  	// checksum command usually takes a long time to execute,
   227  	// so here need to increase the gcLifeTime for single transaction.
   228  	var increaseGCLifeTime bool
   229  	if manager.oriGCLifeTime != "" {
   230  		ori, err := time.ParseDuration(manager.oriGCLifeTime)
   231  		if err != nil {
   232  			return errors.Trace(err)
   233  		}
   234  		if ori < defaultGCLifeTime {
   235  			increaseGCLifeTime = true
   236  		}
   237  	} else {
   238  		increaseGCLifeTime = true
   239  	}
   240  
   241  	if increaseGCLifeTime {
   242  		err = UpdateGCLifeTime(ctx, db, defaultGCLifeTime.String())
   243  		if err != nil {
   244  			return err
   245  		}
   246  	}
   247  
   248  	failpoint.Inject("IncreaseGCUpdateDuration", nil)
   249  
   250  	return nil
   251  }
   252  
// tikvChecksumManager computes checksums by sending coprocessor requests
// to TiKV directly, while gcTTLManager pins the PD service GC safe point
// for the duration of the jobs.
type tikvChecksumManager struct {
	client                 kv.Client
	manager                gcTTLManager
	distSQLScanConcurrency uint // initial per-request DistSQL scan concurrency
}
   258  
   259  // newTiKVChecksumManager return a new tikv checksum manager
   260  func newTiKVChecksumManager(client kv.Client, pdClient pd.Client, distSQLScanConcurrency uint) *tikvChecksumManager {
   261  	return &tikvChecksumManager{
   262  		client:                 client,
   263  		manager:                newGCTTLManager(pdClient),
   264  		distSQLScanConcurrency: distSQLScanConcurrency,
   265  	}
   266  }
   267  
// checksumDB runs the TiKV checksum for tableInfo, retrying up to
// maxErrorRetryCount times on retryable errors. After each failed attempt
// the DistSQL scan concurrency is halved (not below
// minDistSQLScanConcurrency) to reduce pressure on the cluster.
func (e *tikvChecksumManager) checksumDB(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	// Checksum at a TS composed from "now"; the GC TTL keeper is expected
	// to keep this TS safe from GC (see Checksum/addOneJob).
	executor, err := checksum.NewExecutorBuilder(tableInfo.Core, oracle.ComposeTS(time.Now().Unix()*1000, 0)).
		SetConcurrency(e.distSQLScanConcurrency).
		Build()
	if err != nil {
		return nil, errors.Trace(err)
	}

	distSQLScanConcurrency := int(e.distSQLScanConcurrency)
	for i := 0; i < maxErrorRetryCount; i++ {
		// Apply the (possibly reduced) concurrency to every request; the
		// callback never fails, so the error is deliberately ignored.
		_ = executor.Each(func(request *kv.Request) error {
			request.Concurrency = distSQLScanConcurrency
			return nil
		})
		var execRes *tipb.ChecksumResponse
		execRes, err = executor.Execute(ctx, e.client, func() {})
		if err == nil {
			return &RemoteChecksum{
				Schema:     tableInfo.DB,
				Table:      tableInfo.Name,
				Checksum:   execRes.Checksum,
				TotalBytes: execRes.TotalBytes,
				TotalKVs:   execRes.TotalKvs,
			}, nil
		}

		log.L().Warn("remote checksum failed", zap.String("db", tableInfo.DB),
			zap.String("table", tableInfo.Name), zap.Error(err),
			zap.Int("concurrency", distSQLScanConcurrency), zap.Int("retry", i))

		// do not retry context.Canceled error
		if !common.IsRetryableError(err) {
			break
		}
		if distSQLScanConcurrency > minDistSQLScanConcurrency {
			distSQLScanConcurrency = utils.MaxInt(distSQLScanConcurrency/2, minDistSQLScanConcurrency)
		}
	}

	// All attempts failed; return the last error observed.
	return nil, err
}
   309  
// Checksum implements ChecksumManager. It registers the table with the GC
// TTL keeper (pinning the PD service GC safe point at the checksum start
// TS), then runs the TiKV checksum.
// NOTE(review): no matching removeOneJob call is visible here, so the heap
// entry appears to stay registered until ctx is cancelled — confirm this is
// intended.
func (e *tikvChecksumManager) Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	tbl := common.UniqueTable(tableInfo.DB, tableInfo.Name)
	err := e.manager.addOneJob(ctx, tbl, oracle.ComposeTS(time.Now().Unix()*1000, 0))
	if err != nil {
		return nil, errors.Trace(err)
	}

	return e.checksumDB(ctx, tableInfo)
}
   319  
// tableChecksumTS is a heap entry pairing a running checksum job with the
// GC safe TS it needs preserved.
type tableChecksumTS struct {
	table    string
	gcSafeTS uint64
}
   324  
// The following functions implement `heap.Interface` for gcTTLManager.

// Len reports the number of tracked checksum jobs (heap.Interface).
func (m *gcTTLManager) Len() int {
	return len(m.tableGCSafeTS)
}
   330  
// Less orders entries by ascending gcSafeTS, making this a min-heap whose
// root is the smallest safe TS still required (heap.Interface).
func (m *gcTTLManager) Less(i, j int) bool {
	return m.tableGCSafeTS[i].gcSafeTS < m.tableGCSafeTS[j].gcSafeTS
}
   334  
// Swap exchanges two heap entries (heap.Interface).
func (m *gcTTLManager) Swap(i, j int) {
	m.tableGCSafeTS[i], m.tableGCSafeTS[j] = m.tableGCSafeTS[j], m.tableGCSafeTS[i]
}
   338  
   339  func (m *gcTTLManager) Push(x interface{}) {
   340  	m.tableGCSafeTS = append(m.tableGCSafeTS, x.(*tableChecksumTS))
   341  }
   342  
   343  func (m *gcTTLManager) Pop() interface{} {
   344  	i := m.tableGCSafeTS[len(m.tableGCSafeTS)-1]
   345  	m.tableGCSafeTS = m.tableGCSafeTS[:len(m.tableGCSafeTS)-1]
   346  	return i
   347  }
   348  
// gcTTLManager keeps the PD service GC safe point at the minimum safe TS of
// all running checksum jobs, refreshing it periodically so its TTL does not
// lapse while checksums are in progress.
type gcTTLManager struct {
	lock     sync.Mutex
	pdClient pd.Client
	// tableGCSafeTS is a binary heap that stored active checksum jobs GC safe point ts
	tableGCSafeTS []*tableChecksumTS
	// currentTs caches the heap minimum, i.e. the safe point pushed to PD.
	currentTs uint64
	serviceID string
	// 0 for not start, otherwise started
	started uint32
}
   359  
   360  func newGCTTLManager(pdClient pd.Client) gcTTLManager {
   361  	return gcTTLManager{
   362  		pdClient:  pdClient,
   363  		serviceID: fmt.Sprintf("lightning-%s", uuid.New()),
   364  	}
   365  }
   366  
// addOneJob registers a running checksum job for `table` with its GC safe
// TS. It lazily starts the background safe-point keeper on first use, and
// pushes a new safe point to PD immediately when this job lowers the heap
// minimum (or is the first job).
func (m *gcTTLManager) addOneJob(ctx context.Context, table string, ts uint64) error {
	// start gc ttl loop if not started yet.
	if atomic.CompareAndSwapUint32(&m.started, 0, 1) {
		m.start(ctx)
	}
	m.lock.Lock()
	defer m.lock.Unlock()
	var curTs uint64
	if len(m.tableGCSafeTS) > 0 {
		curTs = m.tableGCSafeTS[0].gcSafeTS
	}
	// Push followed by Fix on the last index is equivalent to heap.Push.
	m.Push(&tableChecksumTS{table: table, gcSafeTS: ts})
	heap.Fix(m, len(m.tableGCSafeTS)-1)
	m.currentTs = m.tableGCSafeTS[0].gcSafeTS
	if curTs == 0 || m.currentTs < curTs {
		return m.doUpdateGCTTL(ctx, m.currentTs)
	}
	return nil
}
   386  
// removeOneJob drops the heap entry for `table` (if present) and refreshes
// the cached minimum. The resulting (possibly larger) safe point is not
// pushed to PD here; the periodic keeper started in start() picks it up.
func (m *gcTTLManager) removeOneJob(table string) {
	m.lock.Lock()
	defer m.lock.Unlock()
	idx := -1
	for i := 0; i < len(m.tableGCSafeTS); i++ {
		if m.tableGCSafeTS[i].table == table {
			idx = i
			break
		}
	}

	if idx >= 0 {
		// Standard heap removal: move the last element into the hole, shrink
		// the slice, then restore the heap invariant at idx (unless the
		// removed element was the last one).
		l := len(m.tableGCSafeTS)
		m.tableGCSafeTS[idx] = m.tableGCSafeTS[l-1]
		m.tableGCSafeTS = m.tableGCSafeTS[:l-1]
		if l > 1 && idx < l-1 {
			heap.Fix(m, idx)
		}
	}

	// Recompute the cached minimum; zero means no jobs remain.
	var newTs uint64
	if len(m.tableGCSafeTS) > 0 {
		newTs = m.tableGCSafeTS[0].gcSafeTS
	}
	m.currentTs = newTs
}
   413  
   414  func (m *gcTTLManager) updateGCTTL(ctx context.Context) error {
   415  	m.lock.Lock()
   416  	currentTs := m.currentTs
   417  	m.lock.Unlock()
   418  	return m.doUpdateGCTTL(ctx, currentTs)
   419  }
   420  
   421  func (m *gcTTLManager) doUpdateGCTTL(ctx context.Context, ts uint64) error {
   422  	log.L().Debug("update PD safePoint limit with TTL",
   423  		zap.Uint64("currnet_ts", ts))
   424  	var err error
   425  	if ts > 0 {
   426  		_, err = m.pdClient.UpdateServiceGCSafePoint(ctx,
   427  			m.serviceID, serviceSafePointTTL, ts)
   428  	}
   429  	return err
   430  }
   431  
// start launches the background keeper goroutine that refreshes the PD
// service GC safe point every TTL/preUpdateServiceSafePointFactor so the
// safe point never expires while checksum jobs are running. The goroutine
// exits (and the ticker is stopped) when ctx is cancelled.
func (m *gcTTLManager) start(ctx context.Context) {
	// TTL is never zero, so the refresh interval is always > 0.
	updateGapTime := time.Duration(serviceSafePointTTL) * time.Second / preUpdateServiceSafePointFactor

	updateTick := time.NewTicker(updateGapTime)

	updateGCTTL := func() {
		if err := m.updateGCTTL(ctx); err != nil {
			log.L().Warn("failed to update service safe point, checksum may fail if gc triggered", zap.Error(err))
		}
	}

	// trigger a service gc ttl at start
	updateGCTTL()
	go func() {
		defer updateTick.Stop()
		for {
			select {
			case <-ctx.Done():
				log.L().Info("service safe point keeper exited")
				return
			case <-updateTick.C:
				updateGCTTL()
			}
		}
	}()
}
   458  }