github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/restore/checksum.go

package restore

import (
	"container/heap"
	"context"
	"database/sql"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/uuid"
	"github.com/pingcap/br/pkg/checksum"
	"github.com/pingcap/br/pkg/utils"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	tidbcfg "github.com/pingcap/tidb/config"
	"github.com/pingcap/tidb/kv"
	"github.com/pingcap/tidb/store/tikv"
	"github.com/pingcap/tidb/store/tikv/oracle"
	"github.com/pingcap/tipb/go-tipb"
	pd "github.com/tikv/pd/client"
	"go.uber.org/zap"

	. "github.com/pingcap/tidb-lightning/lightning/checkpoints"
	"github.com/pingcap/tidb-lightning/lightning/common"
	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
	"github.com/pingcap/tidb-lightning/lightning/metric"
)

const (
	preUpdateServiceSafePointFactor = 3

	maxErrorRetryCount = 3
)

var (
	serviceSafePointTTL int64 = 10 * 60 // 10 min in seconds

	minDistSQLScanConcurrency = 4
)

// RemoteChecksum represents a checksum result obtained from TiDB.
type RemoteChecksum struct {
	Schema     string
	Table      string
	Checksum   uint64
	TotalKVs   uint64
	TotalBytes uint64
}

// ChecksumManager computes the remote checksum of a restored table.
type ChecksumManager interface {
	Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error)
}

func newChecksumManager(ctx context.Context, rc *RestoreController) (ChecksumManager, error) {
	// if we don't need a checksum, just return nil
	if rc.cfg.TikvImporter.Backend == config.BackendTiDB || rc.cfg.PostRestore.Checksum == config.OpLevelOff {
		return nil, nil
	}

	pdAddr := rc.cfg.TiDB.PdAddr
	pdVersion, err := common.FetchPDVersion(ctx, rc.tls, pdAddr)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// for v4.0.0 or later, we can use the GC TTL API
	var manager ChecksumManager
	if pdVersion.Major >= 4 {
		tlsOpt := rc.tls.ToPDSecurityOption()
		pdCli, err := pd.NewClientWithContext(ctx, []string{pdAddr}, tlsOpt)
		if err != nil {
			return nil, errors.Trace(err)
		}

		// TODO: make tikv.Driver{}.Open use arguments instead of global variables
		if tlsOpt.CAPath != "" {
			conf := tidbcfg.GetGlobalConfig()
			conf.Security.ClusterSSLCA = tlsOpt.CAPath
			conf.Security.ClusterSSLCert = tlsOpt.CertPath
			conf.Security.ClusterSSLKey = tlsOpt.KeyPath
			tidbcfg.StoreGlobalConfig(conf)
		}
		store, err := tikv.Driver{}.Open(fmt.Sprintf("tikv://%s?disableGC=true", pdAddr))
		if err != nil {
			return nil, errors.Trace(err)
		}

		manager = newTiKVChecksumManager(store.(tikv.Storage).GetClient(), pdCli, uint(rc.cfg.TiDB.DistSQLScanConcurrency))
	} else {
		db, err := rc.tidbGlue.GetDB()
		if err != nil {
			return nil, errors.Trace(err)
		}
		manager = newTiDBChecksumExecutor(db)
	}

	return manager, nil
}

// tidbChecksumExecutor fetches checksums through the TiDB SQL client.
type tidbChecksumExecutor struct {
	db      *sql.DB
	manager *gcLifeTimeManager
}

func newTiDBChecksumExecutor(db *sql.DB) *tidbChecksumExecutor {
	return &tidbChecksumExecutor{
		db:      db,
		manager: newGCLifeTimeManager(),
	}
}

func (e *tidbChecksumExecutor) Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	var err error
	if err = e.manager.addOneJob(ctx, e.db); err != nil {
		return nil, err
	}

	// restore the original GC life time once the checksum finishes
	defer e.manager.removeOneJob(ctx, e.db)

	tableName := common.UniqueTable(tableInfo.DB, tableInfo.Name)

	task := log.With(zap.String("table", tableName)).Begin(zap.InfoLevel, "remote checksum")

	// ADMIN CHECKSUM TABLE <table>,<table> example.
	// mysql> admin checksum table test.t;
	// +---------+------------+---------------------+-----------+-------------+
	// | Db_name | Table_name | Checksum_crc64_xor  | Total_kvs | Total_bytes |
	// +---------+------------+---------------------+-----------+-------------+
	// | test    | t          | 8520875019404689597 | 7296873   | 357601387   |
	// +---------+------------+---------------------+-----------+-------------+

	cs := RemoteChecksum{}
	err = common.SQLWithRetry{DB: e.db, Logger: task.Logger}.QueryRow(ctx, "compute remote checksum",
		"ADMIN CHECKSUM TABLE "+tableName, &cs.Schema, &cs.Table, &cs.Checksum, &cs.TotalKVs, &cs.TotalBytes,
	)
	dur := task.End(zap.ErrorLevel, err)
	metric.ChecksumSecondsHistogram.Observe(dur.Seconds())
	if err != nil {
		return nil, errors.Trace(err)
	}
	return &cs, nil
}

// DoChecksum computes the remote checksum for a table.
// The table should be given in <db>.<table> format, e.g. foo.bar
func DoChecksum(ctx context.Context, table *TidbTableInfo) (*RemoteChecksum, error) {
	var err error
	manager, ok := ctx.Value(&checksumManagerKey).(ChecksumManager)
	if !ok {
		return nil, errors.New("No ChecksumManager found in context, check context initialization")
	}

	task := log.With(zap.String("table", table.Name)).Begin(zap.InfoLevel, "remote checksum")

	cs, err := manager.Checksum(ctx, table)
	dur := task.End(zap.ErrorLevel, err)
	metric.ChecksumSecondsHistogram.Observe(dur.Seconds())

	return cs, err
}

type gcLifeTimeManager struct {
	runningJobsLock sync.Mutex
	runningJobs     int
	oriGCLifeTime   string
}

func newGCLifeTimeManager() *gcLifeTimeManager {
	// Zero values of the three members are enough to initialize this struct.
	return &gcLifeTimeManager{}
}

// Pre- and post-condition:
// if m.runningJobs == 0, GC life time has not been increased.
// if m.runningJobs > 0, GC life time has been increased.
// m.runningJobs won't be negative (overflow) since index concurrency is relatively small.
func (m *gcLifeTimeManager) addOneJob(ctx context.Context, db *sql.DB) error {
	m.runningJobsLock.Lock()
	defer m.runningJobsLock.Unlock()

	if m.runningJobs == 0 {
		oriGCLifeTime, err := ObtainGCLifeTime(ctx, db)
		if err != nil {
			return err
		}
		m.oriGCLifeTime = oriGCLifeTime
		err = increaseGCLifeTime(ctx, m, db)
		if err != nil {
			return err
		}
	}
	m.runningJobs += 1
	return nil
}

// Pre- and post-condition:
// if m.runningJobs == 0, an attempt has been made to restore the GC life time. If that attempt fails, a warning is printed.
// if m.runningJobs > 0, GC life time has not been restored.
// m.runningJobs won't go negative since removeOneJob is only called after a successful addOneJob.
func (m *gcLifeTimeManager) removeOneJob(ctx context.Context, db *sql.DB) {
	m.runningJobsLock.Lock()
	defer m.runningJobsLock.Unlock()

	m.runningJobs -= 1
	if m.runningJobs == 0 {
		err := UpdateGCLifeTime(ctx, db, m.oriGCLifeTime)
		if err != nil {
			query := fmt.Sprintf(
				"UPDATE mysql.tidb SET VARIABLE_VALUE = '%s' WHERE VARIABLE_NAME = 'tikv_gc_life_time'",
				m.oriGCLifeTime,
			)
			log.L().Warn("revert GC lifetime failed, please reset the GC lifetime manually after Lightning completes",
				zap.String("query", query),
				log.ShortError(err),
			)
		}
	}
}

func increaseGCLifeTime(ctx context.Context, manager *gcLifeTimeManager, db *sql.DB) (err error) {
	// The checksum command usually takes a long time to execute,
	// so we need to increase the GC life time for this transaction.
	var increaseGCLifeTime bool
	if manager.oriGCLifeTime != "" {
		ori, err := time.ParseDuration(manager.oriGCLifeTime)
		if err != nil {
			return errors.Trace(err)
		}
		if ori < defaultGCLifeTime {
			increaseGCLifeTime = true
		}
	} else {
		increaseGCLifeTime = true
	}

	if increaseGCLifeTime {
		err = UpdateGCLifeTime(ctx, db, defaultGCLifeTime.String())
		if err != nil {
			return err
		}
	}

	failpoint.Inject("IncreaseGCUpdateDuration", nil)

	return nil
}

type tikvChecksumManager struct {
	client                 kv.Client
	manager                gcTTLManager
	distSQLScanConcurrency uint
}

// newTiKVChecksumManager returns a new TiKV checksum manager.
func newTiKVChecksumManager(client kv.Client, pdClient pd.Client, distSQLScanConcurrency uint) *tikvChecksumManager {
	return &tikvChecksumManager{
		client:                 client,
		manager:                newGCTTLManager(pdClient),
		distSQLScanConcurrency: distSQLScanConcurrency,
	}
}

func (e *tikvChecksumManager) checksumDB(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	executor, err := checksum.NewExecutorBuilder(tableInfo.Core, oracle.ComposeTS(time.Now().Unix()*1000, 0)).
		SetConcurrency(e.distSQLScanConcurrency).
		Build()
	if err != nil {
		return nil, errors.Trace(err)
	}

	distSQLScanConcurrency := int(e.distSQLScanConcurrency)
	for i := 0; i < maxErrorRetryCount; i++ {
		_ = executor.Each(func(request *kv.Request) error {
			request.Concurrency = distSQLScanConcurrency
			return nil
		})
		var execRes *tipb.ChecksumResponse
		execRes, err = executor.Execute(ctx, e.client, func() {})
		if err == nil {
			return &RemoteChecksum{
				Schema:     tableInfo.DB,
				Table:      tableInfo.Name,
				Checksum:   execRes.Checksum,
				TotalBytes: execRes.TotalBytes,
				TotalKVs:   execRes.TotalKvs,
			}, nil
		}

		log.L().Warn("remote checksum failed", zap.String("db", tableInfo.DB),
			zap.String("table", tableInfo.Name), zap.Error(err),
			zap.Int("concurrency", distSQLScanConcurrency), zap.Int("retry", i))

		// do not retry context.Canceled errors
		if !common.IsRetryableError(err) {
			break
		}
		if distSQLScanConcurrency > minDistSQLScanConcurrency {
			distSQLScanConcurrency = utils.MaxInt(distSQLScanConcurrency/2, minDistSQLScanConcurrency)
		}
	}

	return nil, err
}

func (e *tikvChecksumManager) Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) {
	tbl := common.UniqueTable(tableInfo.DB, tableInfo.Name)
	err := e.manager.addOneJob(ctx, tbl, oracle.ComposeTS(time.Now().Unix()*1000, 0))
	if err != nil {
		return nil, errors.Trace(err)
	}

	return e.checksumDB(ctx, tableInfo)
}

type tableChecksumTS struct {
	table    string
	gcSafeTS uint64
}

// The following functions implement `heap.Interface`.

func (m *gcTTLManager) Len() int {
	return len(m.tableGCSafeTS)
}

func (m *gcTTLManager) Less(i, j int) bool {
	return m.tableGCSafeTS[i].gcSafeTS < m.tableGCSafeTS[j].gcSafeTS
}

func (m *gcTTLManager) Swap(i, j int) {
	m.tableGCSafeTS[i], m.tableGCSafeTS[j] = m.tableGCSafeTS[j], m.tableGCSafeTS[i]
}

func (m *gcTTLManager) Push(x interface{}) {
	m.tableGCSafeTS = append(m.tableGCSafeTS, x.(*tableChecksumTS))
}

func (m *gcTTLManager) Pop() interface{} {
	i := m.tableGCSafeTS[len(m.tableGCSafeTS)-1]
	m.tableGCSafeTS = m.tableGCSafeTS[:len(m.tableGCSafeTS)-1]
	return i
}

type gcTTLManager struct {
	lock     sync.Mutex
	pdClient pd.Client
	// tableGCSafeTS is a binary heap that stores the GC safe point TS of the active checksum jobs.
	tableGCSafeTS []*tableChecksumTS
	currentTs     uint64
	serviceID     string
	// 0 means not started; any other value means started
	started uint32
}

func newGCTTLManager(pdClient pd.Client) gcTTLManager {
	return gcTTLManager{
		pdClient:  pdClient,
		serviceID: fmt.Sprintf("lightning-%s", uuid.New()),
	}
}

func (m *gcTTLManager) addOneJob(ctx context.Context, table string, ts uint64) error {
	// start the GC TTL keep-alive loop if it has not started yet
	if atomic.CompareAndSwapUint32(&m.started, 0, 1) {
		m.start(ctx)
	}
	m.lock.Lock()
	defer m.lock.Unlock()
	var curTs uint64
	if len(m.tableGCSafeTS) > 0 {
		curTs = m.tableGCSafeTS[0].gcSafeTS
	}
	m.Push(&tableChecksumTS{table: table, gcSafeTS: ts})
	heap.Fix(m, len(m.tableGCSafeTS)-1)
	m.currentTs = m.tableGCSafeTS[0].gcSafeTS
	if curTs == 0 || m.currentTs < curTs {
		return m.doUpdateGCTTL(ctx, m.currentTs)
	}
	return nil
}

func (m *gcTTLManager) removeOneJob(table string) {
	m.lock.Lock()
	defer m.lock.Unlock()
	idx := -1
	for i := 0; i < len(m.tableGCSafeTS); i++ {
		if m.tableGCSafeTS[i].table == table {
			idx = i
			break
		}
	}

	if idx >= 0 {
		l := len(m.tableGCSafeTS)
		m.tableGCSafeTS[idx] = m.tableGCSafeTS[l-1]
		m.tableGCSafeTS = m.tableGCSafeTS[:l-1]
		if l > 1 && idx < l-1 {
			heap.Fix(m, idx)
		}
	}

	var newTs uint64
	if len(m.tableGCSafeTS) > 0 {
		newTs = m.tableGCSafeTS[0].gcSafeTS
	}
	m.currentTs = newTs
}

func (m *gcTTLManager) updateGCTTL(ctx context.Context) error {
	m.lock.Lock()
	currentTs := m.currentTs
	m.lock.Unlock()
	return m.doUpdateGCTTL(ctx, currentTs)
}

func (m *gcTTLManager) doUpdateGCTTL(ctx context.Context, ts uint64) error {
	log.L().Debug("update PD safePoint limit with TTL",
		zap.Uint64("current_ts", ts))
	var err error
	if ts > 0 {
		_, err = m.pdClient.UpdateServiceGCSafePoint(ctx,
			m.serviceID, serviceSafePointTTL, ts)
	}
	return err
}

func (m *gcTTLManager) start(ctx context.Context) {
	// This is fine since the TTL won't be zero, so updateGapTime will be > 0.
	updateGapTime := time.Duration(serviceSafePointTTL) * time.Second / preUpdateServiceSafePointFactor

	updateTick := time.NewTicker(updateGapTime)

	updateGCTTL := func() {
		if err := m.updateGCTTL(ctx); err != nil {
			log.L().Warn("failed to update service safe point, checksum may fail if GC is triggered", zap.Error(err))
		}
	}

	// trigger a service GC safe point update at start
	updateGCTTL()
	go func() {
		defer updateTick.Stop()
		for {
			select {
			case <-ctx.Done():
				log.L().Info("service safe point keeper exited")
				return
			case <-updateTick.C:
				updateGCTTL()
			}
		}
	}()
}
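
// Usage sketch (illustration only; assumes checksumManagerKey is the package-level
// context key read by DoChecksum above, and that rc and tableInfo come from the
// caller). A caller would typically build the manager once, stash it in the
// context, and then run DoChecksum per table:
//
//	manager, err := newChecksumManager(ctx, rc)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	if manager != nil { // nil when the TiDB backend is used or checksum is off
//		ctx = context.WithValue(ctx, &checksumManagerKey, manager)
//	}
//	remoteChecksum, err := DoChecksum(ctx, tableInfo)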