github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/shardddl/optimist.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package shardddl

import (
	"context"
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/pingcap/failpoint"
	"github.com/pingcap/tidb/pkg/util/dbutil"
	"github.com/pingcap/tiflow/dm/common"
	"github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/config/dbconfig"
	"github.com/pingcap/tiflow/dm/master/metrics"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/shardddl/optimism"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/zap"
)

// Optimist is used to coordinate the shard DDL migration in optimism mode.
type Optimist struct {
	mu sync.Mutex

	logger log.Logger

	closed bool
	cancel context.CancelFunc
	wg     sync.WaitGroup

	cli *clientv3.Client
	lk  *optimism.LockKeeper
	tk  *optimism.TableKeeper
}

// NewOptimist creates a new Optimist instance.
func NewOptimist(pLogger *log.Logger, getDownstreamMetaFunc func(string) (*dbconfig.DBConfig, string)) *Optimist {
	return &Optimist{
		logger: pLogger.WithFields(zap.String("component", "shard DDL optimist")),
		closed: true,
		lk:     optimism.NewLockKeeper(getDownstreamMetaFunc),
		tk:     optimism.NewTableKeeper(),
	}
}

// Start starts the shard DDL coordination in optimism mode.
// NOTE: for logic errors, it should start without returning errors (but report via metrics or logs) so that the user can fix them.
func (o *Optimist) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
	o.logger.Info("the shard DDL optimist is starting")

	o.mu.Lock()
	defer o.mu.Unlock()

	o.cli = etcdCli // o.cli should be set before watching and recovering locks because these operations need o.cli

	revSource, revInfo, revOperation, err := o.rebuildLocks()
	if err != nil {
		return err
	}

	ctx, cancel := context.WithCancel(pCtx)

	o.wg.Add(1)
	go func() {
		defer o.wg.Done()
		// TODO: handle fatal error from run
		//nolint:errcheck
		o.run(ctx, revSource, revInfo, revOperation)
	}()

	o.closed = false // started now, no error will interrupt the start process.
	o.cancel = cancel
	o.logger.Info("the shard DDL optimist has started")
	return nil
}
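
// A minimal usage sketch of the Optimist lifecycle (assumed caller-side wiring,
// not code from this file; `ctx`, `etcdCli` and `getDownstreamMeta` are
// placeholders supplied by the caller, typically the DM-master):
//
//	optimist := NewOptimist(&logger, getDownstreamMeta)
//	if err := optimist.Start(ctx, etcdCli); err != nil {
//		// rebuilding locks from etcd failed; handle the error.
//	}
//	defer optimist.Close()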

// Close closes the Optimist instance.
func (o *Optimist) Close() {
	o.mu.Lock()
	if o.closed {
		o.mu.Unlock()
		return
	}

	if o.cancel != nil {
		o.cancel()
		o.cancel = nil
	}

	o.closed = true // closed now.
	o.mu.Unlock()
	// unlock before wg.Wait() to avoid deadlock because other goroutines acquire the lock,
	// such as https://github.com/pingcap/tiflow/blob/92fc4c4/dm/dm/master/shardddl/optimist.go#L686
	o.wg.Wait()
	o.logger.Info("the shard DDL optimist has closed")
}

// Locks returns all shard DDL locks that currently exist.
func (o *Optimist) Locks() map[string]*optimism.Lock {
	return o.lk.Locks()
}

// ShowLocks is used by the `show-ddl-locks` command.
func (o *Optimist) ShowLocks(task string, sources []string) ([]*pb.DDLLock, error) {
	locks := o.lk.Locks()
	ret := make([]*pb.DDLLock, 0, len(locks))
	var ifm map[string]map[string]map[string]map[string]optimism.Info
	opm, _, err := optimism.GetAllOperations(o.cli)
	if err == nil {
		ifm, _, err = optimism.GetAllInfo(o.cli)
	}
	for _, lock := range locks {
		if task != "" && task != lock.Task {
			continue // a task is specified but does not match
		}
		ready := lock.Ready()
		if len(sources) > 0 {
			for _, source := range sources {
				if _, ok := ready[source]; ok {
					goto FOUND // if any source matches, show the lock for it.
				}
			}
			continue // sources are specified but none match
		}
	FOUND:
		var (
			owners    []string
			ddlGroups [][]string
		)

		appendOwnerDDLs := func(opmss map[string]map[string]optimism.Operation, source string) {
			for schema, opmsst := range opmss {
				for table, op := range opmsst {
					if op.ConflictStage != optimism.ConflictDetected {
						continue
					}
					if _, ok := ifm[lock.Task]; !ok {
						continue
					}
					if _, ok := ifm[lock.Task][source]; !ok {
						continue
					}
					if _, ok := ifm[lock.Task][source][schema]; !ok {
						continue
					}
					if info, ok := ifm[lock.Task][source][schema][table]; ok {
						owners = append(owners, utils.GenDDLLockID(source, schema, table))
						ddlGroups = append(ddlGroups, info.DDLs)
					}
				}
			}
		}
		if opms, ok := opm[lock.Task]; ok {
			if len(sources) > 0 {
				for _, source := range sources {
					if opmss, ok := opms[source]; ok {
						appendOwnerDDLs(opmss, source)
					}
				}
			} else {
				for source, opmss := range opms {
					appendOwnerDDLs(opmss, source)
				}
			}
		}
		lockSynced := make([]string, 0, len(ready))
		lockUnsynced := make([]string, 0, len(ready))
		for source, schemaTables := range ready {
			for schema, tables := range schemaTables {
				for table, synced := range tables {
					if synced {
						lockSynced = append(lockSynced, fmt.Sprintf("%s-%s", source, dbutil.TableName(schema, table)))
					} else {
						lockUnsynced = append(lockUnsynced, fmt.Sprintf("%s-%s", source, dbutil.TableName(schema, table)))
					}
				}
			}
		}
		sort.Strings(lockSynced)
		sort.Strings(lockUnsynced)

		if len(owners) == 0 {
			owners = append(owners, "")
			ddlGroups = append(ddlGroups, nil)
		}
		for i, owner := range owners {
			ret = append(ret, &pb.DDLLock{
				ID:       lock.ID,
				Task:     lock.Task,
				Mode:     config.ShardOptimistic,
				Owner:    owner,
				DDLs:     ddlGroups[i],
				Synced:   lockSynced,
				Unsynced: lockUnsynced,
			})
		}
	}
	return ret, err
}
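
// A hedged, caller-side sketch of consuming ShowLocks (the `optimist` variable
// and the task name are placeholders, not part of this file):
//
//	locks, err := optimist.ShowLocks("task-1", nil) // nil sources = all sources
//	if err != nil {
//		// the etcd read failed; `locks` may still contain partially built entries.
//	}
//	for _, l := range locks {
//		fmt.Println(l.ID, l.Owner, l.Synced, l.Unsynced)
//	}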

// UnlockLock unlocks a shard DDL lock manually, and is only used by the `unlock-ddl-lock` command.
// ID: the shard DDL lock ID.
// source, upstreamSchema, upstreamTable: identify the upstream table whose blocking DDLs we need to skip/exec.
// action: whether to skip or exec the blocking DDLs for the specified upstream table.
// NOTE: this function has side effects; if it fails, some statuses cannot be reverted anymore.
// NOTE: this function should not be called if the lock is still being resolved automatically.
func (o *Optimist) UnlockLock(ctx context.Context, id, source, upstreamSchema, upstreamTable string, action pb.UnlockDDLLockOp) error {
	o.mu.Lock()
	defer o.mu.Unlock()
	if o.closed {
		return terror.ErrMasterOptimistNotStarted.Generate()
	}
	task := utils.ExtractTaskFromLockID(id)
	// 1. find the lock.
	lock := o.lk.FindLock(id)
	if lock == nil {
		return terror.ErrMasterLockNotFound.Generate(id)
	}

	// 2. check whether it has been resolved before (this usually should not happen).
	if lock.IsResolved() {
		_, err := o.removeLock(lock)
		return err
	}

	// 3. find out the related info & operation.
	infos, ops, _, err := optimism.GetInfosOperationsByTask(o.cli, task)
	if err != nil {
		return terror.ErrMasterLockIsResolving.Generatef("fail to get info and operation for task %s", task)
	}
	l := 0
	for i, info := range infos {
		if info.Task == task && info.Source == source && info.UpSchema == upstreamSchema && info.UpTable == upstreamTable {
			infos[l] = infos[i]
			l++
		}
	}
	// TODO: change this condition after unlock ddl supports unlocking several tables at one time
	if l != 1 {
		return terror.ErrMasterLockIsResolving.Generatef("fail to find related info for lock %s", id)
	}
	infos = infos[:l]

	l = 0
	for j, op := range ops {
		if op.Task == task && op.Source == source && op.UpSchema == upstreamSchema && op.UpTable == upstreamTable {
			// TODO: adjust waiting for redirect conflict status
			if op.ConflictStage != optimism.ConflictDetected {
				return terror.ErrMasterLockIsResolving.Generatef("lock %s is in %s status, not conflicted", id, op.ConflictStage)
			}
			ops[l] = ops[j]
			l++
		}
	}
	// TODO: change this condition after unlock ddl supports unlocking several tables at one time
	if l != 1 {
		return terror.ErrMasterLockIsResolving.Generatef("fail to find related operation for lock %s", id)
	}
	ops = ops[:l]

	// 4. rewrite operation.DDLs to skip/exec the DDLs.
	switch action {
	case pb.UnlockDDLLockOp_ExecLock:
		ops[0].DDLs = infos[0].DDLs
	case pb.UnlockDDLLockOp_SkipLock:
		ops[0].DDLs = ops[0].DDLs[:0]
	}
	ops[0].ConflictStage = optimism.ConflictUnlocked

	// 5. put the operation into etcd for workers to execute.
	rev, succ, err := optimism.PutOperation(o.cli, false, ops[0], ops[0].Revision+1)
	if err != nil {
		return err
	}
	if action == pb.UnlockDDLLockOp_ExecLock {
		lock.UpdateTableAfterUnlock(infos[0])
	}
	o.logger.Info("put shard DDL lock operation", zap.String("lock", id),
		zap.Stringer("operation", ops[0]), zap.Bool("already exist", !succ), zap.Int64("revision", rev))
	return nil
}
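
// A hedged example of resolving a detected conflict by hand (all identifiers
// below are illustrative placeholders, not values from this file):
//
//	id := utils.GenDDLLockID("task-1", "db", "tbl") // task + downstream schema/table
//	// skip the blocking DDLs for one upstream table of source "mysql-01":
//	err := optimist.UnlockLock(ctx, id, "mysql-01", "db_01", "tbl_01", pb.UnlockDDLLockOp_SkipLock)
//	// or execute them instead:
//	// err := optimist.UnlockLock(ctx, id, "mysql-01", "db_01", "tbl_01", pb.UnlockDDLLockOp_ExecLock)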

// RemoveMetaDataWithTask removes the meta data for a specified task.
// NOTE: this function can only be used when the specified task is not running.
// It is only used for `--remove-meta` or `stop-task`.
// NOTE: for `stop-task`, we still delete the dropped columns in etcd even though the user may restart the task again later.
func (o *Optimist) RemoveMetaDataWithTask(task string) error {
	o.mu.Lock()
	defer o.mu.Unlock()
	if o.closed {
		return terror.ErrMasterOptimistNotStarted.Generate()
	}

	lockIDSet := make(map[string]struct{})

	infos, ops, _, err := optimism.GetInfosOperationsByTask(o.cli, task)
	if err != nil {
		return err
	}
	for _, info := range infos {
		o.lk.RemoveLockByInfo(info)
		lockIDSet[utils.GenDDLLockID(info.Task, info.DownSchema, info.DownTable)] = struct{}{}
	}
	for _, op := range ops {
		o.lk.RemoveLock(op.ID)
	}

	o.lk.RemoveDownstreamMeta(task)
	o.tk.RemoveTableByTask(task)

	// clear meta data in etcd
	_, err = optimism.DeleteInfosOperationsTablesByTask(o.cli, task, lockIDSet)
	return err
}

// RemoveMetaDataWithTaskAndSources removes the meta data for a specified task and sources.
// NOTE: this function can only be used when the specified task is not running on the given sources.
func (o *Optimist) RemoveMetaDataWithTaskAndSources(task string, sources ...string) error {
	o.mu.Lock()
	defer o.mu.Unlock()
	if o.closed {
		return terror.ErrMasterOptimistNotStarted.Generate()
	}

	dropColumns := make(map[string][]string)

	// get all locks for this task.
	locks := o.lk.FindLocksByTask(task)
	for _, lock := range locks {
		// remove tables by sources for the related lock.
		cols := lock.TryRemoveTableBySources(sources)
		dropColumns[lock.ID] = cols
		o.logger.Debug("the tables removed from the lock", zap.String("task", task), zap.Strings("sources", sources))
		if !lock.HasTables() {
			o.lk.RemoveLock(lock.ID)
		}
	}

	o.lk.RemoveDownstreamMeta(task)
	// remove source tables in the table keeper.
	o.tk.RemoveTableByTaskAndSources(task, sources)
	o.logger.Debug("the tables removed from the table keeper", zap.String("task", task), zap.Strings("sources", sources))
	// clear meta data in etcd
	_, err := optimism.DeleteInfosOperationsTablesByTaskAndSource(o.cli, task, sources, dropColumns)
	return err
}

// run runs jobs in the background.
func (o *Optimist) run(ctx context.Context, revSource, revInfo, revOperation int64) error {
	for {
		err := o.watchSourceInfoOperation(ctx, revSource, revInfo, revOperation)
		if etcdutil.IsRetryableError(err) {
			retryNum := 0
			for {
				retryNum++
				select {
				case <-ctx.Done():
					return nil
				case <-time.After(500 * time.Millisecond):
					revSource, revInfo, revOperation, err = o.rebuildLocks()
					if err != nil {
						o.logger.Error("fail to rebuild shard DDL lock, will retry",
							zap.Int("retryNum", retryNum), zap.Error(err))
						continue
					}
				}
				break
			}
		} else {
			if err != nil {
				o.logger.Error("non-retryable error occurred, optimist will quit now", zap.Error(err))
			}
			return err
		}
	}
}
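
// The lock recovery below follows the usual etcd "snapshot then watch"
// handshake: each GET returns the revision of its snapshot, and the matching
// WATCH in watchSourceInfoOperation resumes from that revision + 1, so no
// event is lost or processed twice. A minimal sketch of the pattern (channel
// names are placeholders):
//
//	opm, rev, err := optimism.GetAllOperations(cli) // snapshot at revision `rev`
//	// ... recover in-memory state from opm ...
//	optimism.WatchOperationPut(ctx, cli, "", "", "", "", rev+1, opCh, errCh) // resume right after the snapshot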

// rebuildLocks rebuilds shard DDL locks from etcd persistent data.
func (o *Optimist) rebuildLocks() (revSource, revInfo, revOperation int64, err error) {
	o.lk.Clear() // clear all previous locks to support re-Start.

	// get the history & initial source tables.
	stm, revSource, err := optimism.GetAllSourceTables(o.cli)
	if err != nil {
		return 0, 0, 0, err
	}
	// we do not log `stm`, `ifm` and `opm` now, because they may be too long in optimism mode.
	o.logger.Info("get history initial source tables", zap.Int64("revision", revSource))
	o.tk.Init(stm) // re-initialize with valid tables.

	// get the history shard DDL info.
	ifm, revInfo, err := optimism.GetAllInfo(o.cli)
	if err != nil {
		return 0, 0, 0, err
	}
	o.logger.Info("get history shard DDL info", zap.Int64("revision", revInfo))

	// get the history shard DDL lock operations.
	// operations put after this GET will be received through the WATCH starting from `revOperation+1`.
	opm, revOperation, err := optimism.GetAllOperations(o.cli)
	if err != nil {
		return 0, 0, 0, err
	}
	o.logger.Info("get history shard DDL lock operation", zap.Int64("revision", revOperation))

	colm, _, err := optimism.GetAllDroppedColumns(o.cli)
	if err != nil {
		// only log the error and don't return it, so that it does not forbid the startup of the DM-master leader.
		// these unexpected columns can then be handled by the user.
		o.logger.Error("fail to recover dropped columns", log.ShortError(err))
	}
	o.lk.SetDropColumns(colm)

	// recover the shard DDL locks based on the history shard DDL info & lock operations.
	err = o.recoverLocks(ifm, opm)
	if err != nil {
		// only log the error and don't return it, so that it does not forbid the startup of the DM-master leader.
		// these unexpected locks can then be handled by the user.
		o.logger.Error("fail to recover locks", log.ShortError(err))
	}
	o.lk.SetDropColumns(nil)

	return revSource, revInfo, revOperation, nil
}

// sortInfos sorts all infos by revision.
func sortInfos(ifm map[string]map[string]map[string]map[string]optimism.Info) []optimism.Info {
	infos := make([]optimism.Info, 0, len(ifm))

	for _, ifTask := range ifm {
		for _, ifSource := range ifTask {
			for _, ifSchema := range ifSource {
				for _, info := range ifSchema {
					infos = append(infos, info)
				}
			}
		}
	}

	// sort according to the Revision.
	sort.Slice(infos, func(i, j int) bool {
		return infos[i].Revision < infos[j].Revision
	})
	return infos
}

// recoverLocks recovers shard DDL locks based on shard DDL info and shard DDL lock operations.
func (o *Optimist) recoverLocks(
	ifm map[string]map[string]map[string]map[string]optimism.Info,
	opm map[string]map[string]map[string]map[string]optimism.Operation,
) error {
	// sort infos by revision.
	infos := sortInfos(ifm)
	var firstErr error
	setFirstErr := func(err error) {
		if firstErr == nil && err != nil {
			firstErr = err
		}
	}

	for _, info := range infos {
		if info.IsDeleted {
			// TODO: handle drop table
			continue
		}
		if !o.tk.SourceTableExist(info.Task, info.Source, info.UpSchema, info.UpTable, info.DownSchema, info.DownTable) {
			continue
		}
		// never mark the lock operation from `done` to `not-done` when recovering.
		err := o.handleInfo(info, true)
		if err != nil {
			o.logger.Error("fail to handle info while recovering locks", zap.Error(err))
			setFirstErr(err)
		}
	}

	// update the done status of the locks.
	for _, opTask := range opm {
		for _, opSource := range opTask {
			for _, opSchema := range opSource {
				for _, op := range opSchema {
					lock := o.lk.FindLock(op.ID)
					if lock == nil {
						o.logger.Warn("lock for the operation not found", zap.Stringer("operation", op))
						continue
					}
					if op.Done {
						lock.TryMarkDone(op.Source, op.UpSchema, op.UpTable)
						err := lock.DeleteColumnsByOp(op)
						if err != nil {
							o.logger.Error("fail to update lock columns", zap.Error(err))
						}
						// should remove the resolved lock, or it will be kept until the next DDL.
						if lock.IsResolved() {
							o.removeLockOptional(op, lock)
						}
					}
				}
			}
		}
	}
	return firstErr
}

// watchSourceInfoOperation watches etcd for source tables, shard DDL infos and shard DDL lock operations.
func (o *Optimist) watchSourceInfoOperation(
	pCtx context.Context, revSource, revInfo, revOperation int64,
) error {
	ctx, cancel := context.WithCancel(pCtx)
	var wg sync.WaitGroup
	defer func() {
		cancel()
		wg.Wait()
	}()

	errCh := make(chan error, 10)

	// watch for source tables and handle them.
	sourceCh := make(chan optimism.SourceTables, 10)
	wg.Add(2)
	go func() {
		defer func() {
			wg.Done()
			close(sourceCh)
		}()
		optimism.WatchSourceTables(ctx, o.cli, revSource+1, sourceCh, errCh)
	}()
	go func() {
		defer wg.Done()
		o.handleSourceTables(ctx, sourceCh)
	}()

	// watch for the shard DDL infos and handle them.
	infoCh := make(chan optimism.Info, 10)
	wg.Add(2)
	go func() {
		defer func() {
			wg.Done()
			close(infoCh)
		}()
		optimism.WatchInfo(ctx, o.cli, revInfo+1, infoCh, errCh)
	}()
	go func() {
		defer wg.Done()
		o.handleInfoPut(ctx, infoCh)
	}()

	// watch for the shard DDL lock operations and handle them.
	opCh := make(chan optimism.Operation, 10)
	wg.Add(2)
	go func() {
		defer func() {
			wg.Done()
			close(opCh)
		}()
		optimism.WatchOperationPut(ctx, o.cli, "", "", "", "", revOperation+1, opCh, errCh)
	}()
	go func() {
		defer wg.Done()
		o.handleOperationPut(ctx, opCh)
	}()

	select {
	case err := <-errCh:
		return err
	case <-pCtx.Done():
		return nil
	}
}

// handleSourceTables handles PUT and DELETE for source tables.
func (o *Optimist) handleSourceTables(ctx context.Context, sourceCh <-chan optimism.SourceTables) {
	for {
		select {
		case <-ctx.Done():
			return
		case st, ok := <-sourceCh:
			if !ok {
				return
			}
			o.mu.Lock()
			addedTable, droppedTable := o.tk.Update(st)
			// handle create table.
			for routeTable := range addedTable {
				lock := o.lk.FindLock(utils.GenDDLLockID(st.Task, routeTable.DownSchema, routeTable.DownTable))
				if lock != nil {
					lock.AddTable(st.Source, routeTable.UpSchema, routeTable.UpTable, true)
				}
			}
			// handle drop table.
			for routeTable := range droppedTable {
				lock := o.lk.FindLock(utils.GenDDLLockID(st.Task, routeTable.DownSchema, routeTable.DownTable))
				if lock != nil {
					cols := lock.TryRemoveTable(st.Source, routeTable.UpSchema, routeTable.UpTable)
					if !lock.HasTables() {
						o.lk.RemoveLock(lock.ID)
					}
					_, err := optimism.DeleteInfosOperationsTablesByTable(o.cli, st.Task, st.Source, routeTable.UpSchema, routeTable.UpTable, lock.ID, cols)
					if err != nil {
						o.logger.Error("failed to delete etcd meta data for table", zap.String("lockID", lock.ID), zap.String("schema", routeTable.UpSchema), zap.String("table", routeTable.UpTable))
					}
				}
			}
			o.mu.Unlock()
		}
	}
}

// handleInfoPut handles PUT and DELETE for the shard DDL info.
func (o *Optimist) handleInfoPut(ctx context.Context, infoCh <-chan optimism.Info) {
	for {
		select {
		case <-ctx.Done():
			return
		case info, ok := <-infoCh:
			if !ok {
				return
			}
			o.logger.Info("receive a shard DDL info", zap.Stringer("info", info), zap.Bool("is deleted", info.IsDeleted))

			if info.IsDeleted {
				// this often happens after the lock is resolved.
				continue
			}

			// avoid a new DDL being added while a previous DDL is resolved and its lock removed.
			// change lock granularity if needed.
			o.mu.Lock()
			// put the operation for the table. we don't set `skipDone=true` now,
			// because in optimism mode, one table may execute/finish multiple DDLs while other tables do nothing.
			_ = o.handleInfo(info, false)
			o.mu.Unlock()
		}
	}
}

func (o *Optimist) handleInfo(info optimism.Info, skipDone bool) error {
	added := o.tk.AddTable(info.Task, info.Source, info.UpSchema, info.UpTable, info.DownSchema, info.DownTable)
	o.logger.Debug("a table added for info", zap.Bool("added", added), zap.String("info", info.ShortString()))

	tts := o.tk.FindTables(info.Task, info.DownSchema, info.DownTable)
	if tts == nil {
		// the WATCH for SourceTables may fall behind the WATCH for Info although the PUT happened earlier,
		// so we try to get SourceTables again.
		// NOTE: check SourceTables for `info.Source` if needed later.
		stm, _, err := optimism.GetAllSourceTables(o.cli)
		if err != nil {
			o.logger.Error("fail to get source tables", log.ShortError(err))
		} else if tts2 := optimism.TargetTablesForTask(info.Task, info.DownSchema, info.DownTable, stm); tts2 != nil {
			tts = tts2
		}
	}
	err := o.handleLock(info, tts, skipDone)
	if err != nil {
		o.logger.Error("fail to handle the shard DDL lock", zap.String("info", info.ShortString()), log.ShortError(err))
		metrics.ReportDDLError(info.Task, metrics.InfoErrHandleLock)
	}
	return err
}

// handleOperationPut handles PUT for the shard DDL lock operations.
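// Only operations already reported as done by a DM-worker are acted on here:
// for an operation that is not yet done, this function just logs it and keeps
// waiting for a later PUT with `Done == true`.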
func (o *Optimist) handleOperationPut(ctx context.Context, opCh <-chan optimism.Operation) {
	for {
		select {
		case <-ctx.Done():
			return
		case op, ok := <-opCh:
			if !ok {
				return
			}
			o.logger.Info("receive a shard DDL lock operation", zap.Stringer("operation", op))
			if !op.Done {
				o.logger.Info("the shard DDL lock operation is not done yet", zap.Stringer("operation", op))
				continue
			}

			// avoid a new DDL being added while a previous DDL is resolved and its lock removed.
			// change lock granularity if needed.
			o.mu.Lock()
			o.handleOperation(op)
			o.mu.Unlock()
		}
	}
}

func (o *Optimist) handleOperation(op optimism.Operation) {
	lock := o.lk.FindLock(op.ID)
	if lock == nil {
		o.logger.Warn("no lock for the shard DDL lock operation exists", zap.Stringer("operation", op))
		return
	}

	err := lock.DeleteColumnsByOp(op)
	if err != nil {
		o.logger.Error("fail to update lock columns", zap.Error(err))
	}
	// in optimistic mode, we always try to mark a table as done after receiving the `done` status of its DDL operation.
	// NOTE: even if all tables have done their previous DDL operations, the lock may still not be resolved,
	// because these tables may have different schemas.
	done := lock.TryMarkDone(op.Source, op.UpSchema, op.UpTable)
	o.logger.Info("mark operation for a table as done", zap.Bool("done", done), zap.Stringer("operation", op))
	if !lock.IsResolved() {
		o.logger.Info("the lock is still not resolved", zap.Stringer("operation", op))
		return
	}
	o.removeLockOptional(op, lock)
}

func (o *Optimist) removeLockOptional(op optimism.Operation, lock *optimism.Lock) {
	// the lock has been resolved, remove it.
	o.logger.Info("the lock for the shard DDL lock operation has been resolved", zap.Stringer("operation", op))
	deleted, err := o.removeLock(lock)
	if err != nil {
		o.logger.Error("fail to delete the shard DDL infos and lock operations", zap.String("lock", lock.ID), log.ShortError(err))
		metrics.ReportDDLError(op.Task, metrics.OpErrRemoveLock)
	}
	if deleted {
		o.logger.Info("the shard DDL infos and lock operations have been cleared", zap.Stringer("operation", op))
	}
}

// handleLock handles a single shard DDL lock.
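// The conflict stage attached to the resulting operation is derived from the
// outcome of LockKeeper.TrySync as follows:
//   - no error: ConflictNone, and the returned DDLs are put for the worker;
//   - ErrShardDDLOptimismNeedSkipAndRedirect: ConflictSkipWaitRedirect;
//   - ErrShardDDLOptimismTrySyncFail: ConflictDetected;
//   - any other error: ConflictError.
//
// When info.IgnoreConflict is set, no operation is put at all.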
func (o *Optimist) handleLock(info optimism.Info, tts []optimism.TargetTable, skipDone bool) error {
	var (
		cfStage = optimism.ConflictNone
		cfMsg   = ""
	)

	lockID, newDDLs, cols, err := o.lk.TrySync(o.cli, info, tts)
	switch {
	case info.IgnoreConflict:
		o.logger.Warn("error occurred when trying to sync the shard DDL info, this often means a shard DDL conflict was detected",
			zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
	case err != nil:
		switch {
		case terror.ErrShardDDLOptimismNeedSkipAndRedirect.Equal(err):
			cfStage = optimism.ConflictSkipWaitRedirect
			cfMsg = err.Error()
			o.logger.Warn("Please make sure all sharding tables execute this DDL in order", log.ShortError(err))
		case terror.ErrShardDDLOptimismTrySyncFail.Equal(err):
			cfStage = optimism.ConflictDetected
			cfMsg = err.Error()
			o.logger.Warn("conflict occurred when trying to sync the shard DDL info, this often means a shard DDL conflict was detected",
				zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
		default:
			cfStage = optimism.ConflictError // we treat any errors returned from `TrySync` as conflict detected now.
			cfMsg = err.Error()
			o.logger.Warn("error occurred when trying to sync the shard DDL info, this often means a shard DDL error happened",
				zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
		}
	default:
		o.logger.Info("the shard DDL lock returned some DDLs",
			zap.String("lock", lockID), zap.Strings("ddls", newDDLs), zap.Strings("cols", cols), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted))
	}

	lock := o.lk.FindLock(lockID)
	if lock == nil {
		// should not happen
		return terror.ErrMasterLockNotFound.Generate(lockID)
	}

	// check whether the lock has been resolved.
	if lock.IsResolved() {
		// remove all operations for this shard DDL lock.
		// this is to handle the case where dm-master exits before deleting the operations for them.
		_, err = o.removeLock(lock)
		if err != nil {
			return err
		}
		return nil
	}

	if info.IgnoreConflict {
		return nil
	}

	op := optimism.NewOperation(lockID, lock.Task, info.Source, info.UpSchema, info.UpTable, newDDLs, cfStage, cfMsg, false, cols)
	rev, succ, err := optimism.PutOperation(o.cli, skipDone, op, info.Revision)
	if err != nil {
		return err
	}
	o.logger.Info("put shard DDL lock operation", zap.String("lock", lockID),
		zap.Stringer("operation", op), zap.Bool("already exist", !succ), zap.Int64("revision", rev))
	return nil
}

// removeLock removes the lock in memory and its information in etcd.
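// Inside the `SleepWhenRemoveLock` failpoint block below, the etcd transaction
// carries only comparisons (no writes): `Version(key) < info.Version+1` holds
// as long as no newer shard DDL info has been PUT for that table, so
// `resp.Succeeded == false` means new DDL info arrived while waiting.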
func (o *Optimist) removeLock(lock *optimism.Lock) (bool, error) {
	failpoint.Inject("SleepWhenRemoveLock", func(val failpoint.Value) {
		t := val.(int)
		log.L().Info("wait for new DDL info to be put into etcd in optimistic mode",
			zap.String("failpoint", "SleepWhenRemoveLock"),
			zap.Int("max wait second", t))

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		timer := time.NewTimer(time.Duration(t) * time.Second)
		defer timer.Stop()
	OUTER:
		for {
			select {
			case <-timer.C:
				log.L().Info("failed to wait for new DDL info", zap.Int("wait second", t))
				break OUTER
			case <-ticker.C:
				// manually check etcd.
				cmps := make([]clientv3.Cmp, 0)
				for source, schemaTables := range lock.Ready() {
					for schema, tables := range schemaTables {
						for table := range tables {
							info := optimism.NewInfo(lock.Task, source, schema, table, lock.DownSchema, lock.DownTable, nil, nil, nil)
							info.Version = lock.GetVersion(source, schema, table)
							key := common.ShardDDLOptimismInfoKeyAdapter.Encode(info.Task, info.Source, info.UpSchema, info.UpTable)
							cmps = append(cmps, clientv3.Compare(clientv3.Version(key), "<", info.Version+1))
						}
					}
				}
				resp, _, err := etcdutil.DoTxnWithRepeatable(o.cli, etcdutil.FullOpFunc(cmps, nil, nil))
				if err == nil && !resp.Succeeded {
					log.L().Info("found new DDL info")
					break OUTER
				}
			}
		}
	})
	deleted, err := o.deleteInfosOps(lock)
	if err != nil {
		return deleted, err
	}
	if !deleted {
		return false, nil
	}
	o.lk.RemoveLock(lock.ID)
	metrics.ReportDDLPending(lock.Task, metrics.DDLPendingSynced, metrics.DDLPendingNone)
	return true, nil
}

// deleteInfosOps DELETEs the shard DDL lock infos and operations.
func (o *Optimist) deleteInfosOps(lock *optimism.Lock) (bool, error) {
	infos := make([]optimism.Info, 0)
	ops := make([]optimism.Operation, 0)
	for source, schemaTables := range lock.Ready() {
		for schema, tables := range schemaTables {
			for table := range tables {
				// NOTE: we rely on only `task`, `source`, `upSchema`, `upTable` and `Version` being used for deletion.
				info := optimism.NewInfo(lock.Task, source, schema, table, lock.DownSchema, lock.DownTable, nil, nil, nil)
				info.Version = lock.GetVersion(source, schema, table)
				infos = append(infos, info)
				ops = append(ops, optimism.NewOperation(lock.ID, lock.Task, source, schema, table, nil, optimism.ConflictNone, "", false, nil))
			}
		}
	}
	// NOTE: we rely on only `task`, `downSchema`, and `downTable` being used for deletion.
	rev, deleted, err := optimism.DeleteInfosOperationsColumns(o.cli, infos, ops, lock.ID)
	if err != nil {
		return deleted, err
	}
	if deleted {
		o.logger.Info("delete shard DDL infos and lock operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
	} else {
		o.logger.Info("fail to delete shard DDL infos and lock operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
	}
	return deleted, nil
}