github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/shardddl/pessimist.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package shardddl

import (
	"context"
	"sort"
	"sync"
	"time"

	"github.com/pingcap/failpoint"
	"github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/master/metrics"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/shardddl/pessimism"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/zap"
)

var (
	// variables to control the behavior of waiting for the operation to be done for `UnlockLock`.
	unlockWaitInterval = time.Second
	unlockWaitNum      = 10
)

// Pessimist is used to coordinate the shard DDL migration in pessimism mode.
type Pessimist struct {
	mu sync.Mutex

	logger log.Logger

	closed bool
	cancel context.CancelFunc
	wg     sync.WaitGroup

	cli *clientv3.Client
	lk  *pessimism.LockKeeper

	// taskSources is used to get all sources relative to the given task.
	taskSources func(task string) []string

	infoOpMu sync.Mutex
}

// NewPessimist creates a new Pessimist instance.
func NewPessimist(pLogger *log.Logger, taskSources func(task string) []string) *Pessimist {
	return &Pessimist{
		logger:      pLogger.WithFields(zap.String("component", "shard DDL pessimist")),
		closed:      true, // mark as closed before started.
		lk:          pessimism.NewLockKeeper(),
		taskSources: taskSources,
	}
}

// Start starts the shard DDL coordination in pessimism mode.
// NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
func (p *Pessimist) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
	p.logger.Info("the shard DDL pessimist is starting")

	p.mu.Lock()
	defer p.mu.Unlock()

	p.cli = etcdCli // p.cli should be set before watching and recovering locks because these operations need p.cli
	rev1, rev2, err := p.buildLocks(etcdCli)
	if err != nil {
		return err
	}
	ctx, cancel := context.WithCancel(pCtx)
	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		// TODO: handle fatal error from run
		//nolint:errcheck
		p.run(ctx, etcdCli, rev1, rev2)
	}()

	p.closed = false // started now.
	p.cancel = cancel
	p.logger.Info("the shard DDL pessimist has started")
	return nil
}
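
// Editor's illustration (not part of the upstream package): the sketch below shows
// the expected lifecycle of a Pessimist — build it with a task-to-sources resolver,
// Start it with an etcd client, and Close it on shutdown. The resolver and its
// source name are hypothetical stand-ins.
func examplePessimistLifecycle(ctx context.Context, etcdCli *clientv3.Client, logger log.Logger) error {
	// hypothetical resolver: return the upstream sources bound to a task.
	taskSources := func(task string) []string { return []string{"mysql-replica-01"} }
	p := NewPessimist(&logger, taskSources)
	if err := p.Start(ctx, etcdCli); err != nil {
		return err // Start only fails on etcd errors; lock-recovery errors are logged instead.
	}
	defer p.Close() // Close cancels the watch loop and waits for the background goroutines.
	return nil
}
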
func (p *Pessimist) run(ctx context.Context, etcdCli *clientv3.Client, rev1, rev2 int64) error {
	for {
		err := p.watchInfoOperation(ctx, etcdCli, rev1, rev2)
		if etcdutil.IsRetryableError(err) {
			retryNum := 1
			succeed := false
			for !succeed {
				select {
				case <-ctx.Done():
					return nil
				case <-time.After(500 * time.Millisecond):
					rev1, rev2, err = p.buildLocks(etcdCli)
					if err != nil {
						log.L().Error("fail to rebuild shard DDL locks, will retry later", zap.Error(err), zap.Int("retryNum", retryNum))
					} else {
						succeed = true
					}
				}
				retryNum++
			}
		} else {
			if err != nil {
				log.L().Error("pessimist failed and will quit now", zap.Error(err))
			} else {
				log.L().Info("pessimist will quit now")
			}
			return err
		}
	}
}

func (p *Pessimist) buildLocks(etcdCli *clientv3.Client) (int64, int64, error) {
	p.lk.Clear() // clear all previous locks to support re-Start.

	// get the history shard DDL info.
	// for the sequence of coordinating a shard DDL lock, see `/pkg/shardddl/pessimism/doc.go`.
	ifm, rev1, err := pessimism.GetAllInfo(etcdCli)
	if err != nil {
		return 0, 0, err
	}
	p.logger.Info("get history shard DDL info", zap.Reflect("info", ifm), zap.Int64("revision", rev1))

	// get the history shard DDL lock operations.
	// new operations after this GET will be received through the WATCH with `rev2`,
	// and calling `Lock.MarkDone` multiple times is fine.
	opm, rev2, err := pessimism.GetAllOperations(etcdCli)
	if err != nil {
		return 0, 0, err
	}
	p.logger.Info("get history shard DDL lock operation", zap.Reflect("operation", opm), zap.Int64("revision", rev2))

	// recover the shard DDL locks based on the history shard DDL info & lock operations.
	err = p.recoverLocks(ifm, opm)
	if err != nil {
		// only log the error, and don't return it, so the startup of the DM-master leader is not blocked.
		// then these unexpected locks can be handled by the user.
		p.logger.Error("fail to recover locks", log.ShortError(err))
	}
	return rev1, rev2, nil
}

func (p *Pessimist) watchInfoOperation(pCtx context.Context, etcdCli *clientv3.Client, rev1, rev2 int64) error {
	ctx, cancel := context.WithCancel(pCtx)
	var wg sync.WaitGroup
	defer func() {
		cancel()
		wg.Wait()
	}()

	// watch for the shard DDL info and handle them.
	infoCh := make(chan pessimism.Info, 10)
	errCh := make(chan error, 10)
	wg.Add(2)
	go func() {
		defer func() {
			wg.Done()
			close(infoCh)
		}()
		pessimism.WatchInfoPut(ctx, etcdCli, rev1+1, infoCh, errCh)
	}()
	go func() {
		defer wg.Done()
		p.handleInfoPut(ctx, infoCh)
	}()

	// watch for the shard DDL lock operations and handle them.
	opCh := make(chan pessimism.Operation, 10)
	wg.Add(2)
	go func() {
		defer func() {
			wg.Done()
			close(opCh)
		}()
		pessimism.WatchOperationPut(ctx, etcdCli, "", "", rev2+1, opCh, errCh)
	}()
	go func() {
		defer wg.Done()
		p.handleOperationPut(ctx, opCh)
	}()

	select {
	case err := <-errCh:
		return err
	case <-pCtx.Done():
		return nil
	}
}
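
// Editor's illustration (not upstream code): buildLocks returns the etcd revisions of
// its two GETs, and watchInfoOperation starts WATCHing from revision+1, so no PUT
// between the GET and the WATCH is lost or delivered twice. The generic clientv3
// pattern looks roughly like the hypothetical sketch below (the key "example-prefix"
// is an assumption, not a real DM key).
func exampleGetThenWatch(ctx context.Context, cli *clientv3.Client) (clientv3.WatchChan, error) {
	resp, err := cli.Get(ctx, "example-prefix", clientv3.WithPrefix())
	if err != nil {
		return nil, err
	}
	// resume exactly after the snapshot: only events with revision > resp.Header.Revision are delivered.
	return cli.Watch(ctx, "example-prefix", clientv3.WithPrefix(), clientv3.WithRev(resp.Header.Revision+1)), nil
}
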
// Close closes the Pessimist instance.
func (p *Pessimist) Close() {
	p.mu.Lock()
	defer p.mu.Unlock()

	if p.closed {
		return
	}

	if p.cancel != nil {
		p.cancel()
		p.cancel = nil
	}

	p.wg.Wait()
	p.closed = true // closed now.
	p.logger.Info("the shard DDL pessimist has closed")
}

// Locks returns all shard DDL locks that currently exist.
func (p *Pessimist) Locks() map[string]*pessimism.Lock {
	return p.lk.Locks()
}

// ShowLocks is used by the `show-ddl-locks` command.
func (p *Pessimist) ShowLocks(task string, sources []string) []*pb.DDLLock {
	locks := p.lk.Locks()
	ret := make([]*pb.DDLLock, 0, len(locks))
	for _, lock := range locks {
		if task != "" && task != lock.Task {
			continue // a task is specified but mismatched
		}
		ready := lock.Ready()
		if len(sources) > 0 {
			for _, worker := range sources {
				if _, ok := ready[worker]; ok {
					goto FOUND // if any source matches, show the lock for it.
				}
			}
			continue // sources are specified but mismatched
		}
	FOUND:
		l := &pb.DDLLock{
			ID:       lock.ID,
			Task:     lock.Task,
			Mode:     config.ShardPessimistic,
			Owner:    lock.Owner,
			DDLs:     lock.DDLs,
			Synced:   make([]string, 0, len(ready)),
			Unsynced: make([]string, 0, len(ready)),
		}
		for worker, synced := range ready {
			if synced {
				l.Synced = append(l.Synced, worker)
			} else {
				l.Unsynced = append(l.Unsynced, worker)
			}
		}
		sort.Strings(l.Synced)
		sort.Strings(l.Unsynced)
		ret = append(ret, l)
	}
	return ret
}

// UnlockLock unlocks a shard DDL lock manually when using the `unlock-ddl-lock` command.
// ID: the shard DDL lock ID.
// replaceOwner: the new owner used to replace the original owner for executing the DDL to the downstream.
//
//	if the original owner still exists, we should NOT specify any replaceOwner.
//
// forceRemove: whether to force removal of the DDL lock even if unlocking fails (for the owner).
//
//	if forceRemove is specified and unlocking then fails, we may need to use `BreakLock` later.
//
// NOTE: this function has side effects; if it fails, some status can't be reverted anymore.
// NOTE: this function should not be called if the lock is still in automatic resolving.
func (p *Pessimist) UnlockLock(ctx context.Context, id, replaceOwner string, forceRemove bool) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.closed {
		return terror.ErrMasterPessimistNotStarted.Generate()
	}
	// 1. find the lock.
	lock := p.lk.FindLock(id)
	if lock == nil {
		return terror.ErrMasterLockNotFound.Generate(id)
	}

	// 2. check whether it has been resolved before (this usually should not happen).
	if lock.IsResolved() {
		err := p.removeLock(lock)
		if err != nil {
			return err
		}
		return terror.ErrMasterLockIsResolving.Generatef("the lock %s has been resolved before", id)
	}

	// 3. find out synced & un-synced sources.
	ready := lock.Ready()
	synced := make([]string, 0, len(ready))
	unsynced := make([]string, 0, len(ready))
	for source, isSynced := range ready {
		if isSynced {
			synced = append(synced, source)
		} else {
			unsynced = append(unsynced, source)
		}
	}
	sort.Strings(synced)
	sort.Strings(unsynced)
	p.logger.Warn("some sources are still not synced before unlocking the lock",
		zap.Strings("un-synced", unsynced), zap.Strings("synced", synced))

	// 4. check whether the owner has synced (and it must be synced if using `UnlockLock`).
	// if no source has synced yet, we should choose to use `BreakLock` instead.
	owner := lock.Owner
	if replaceOwner != "" {
		p.logger.Warn("replace the owner of the lock", zap.String("lock", id),
			zap.String("original owner", owner), zap.String("new owner", replaceOwner))
		owner = replaceOwner
	}
	if isSynced, ok := ready[owner]; !ok || !isSynced {
		return terror.ErrMasterWorkerNotWaitLock.Generatef(
			"owner %s is not waiting for a lock, but sources %v are waiting for the lock", owner, synced)
	}

	// 5. force to mark the lock as synced.
	lock.ForceSynced()
	var revertLockSync bool // revert the lock's sync status if the operation for the owner is not done.
	defer func() {
		if revertLockSync {
			lock.RevertSynced(unsynced)
			p.logger.Warn("revert some sources' stage to un-synced", zap.Strings("sources", unsynced))
		}
	}()

	// 6. put the `exec` operation for the owner, and wait for the owner to be done.
	done, err := p.waitOwnerToBeDone(ctx, lock, owner)
	if err != nil {
		revertLockSync = true
		return err
	} else if !done && !forceRemove { // if `forceRemove==true`, we still try to complete the following steps.
		revertLockSync = true
		return terror.ErrMasterOwnerExecDDL.Generatef(
			"the owner %s of the lock %s has not done the operation", owner, id)
	}

	// 7. put `skip` operations for other sources, and wait for them to be done.
	// NOTE: we don't put operations for un-synced sources,
	// because they should not be waiting for these operations.
	done, err = p.waitNonOwnerToBeDone(ctx, lock, owner, synced)
	if err != nil {
		p.logger.Error("the owner has done the exec operation, but waiting for some other sources to do the skip operation failed; the lock is still removed",
			zap.String("lock", id), zap.Bool("force remove", forceRemove), zap.String("owner", owner),
			zap.Strings("un-synced", unsynced), zap.Strings("synced", synced), zap.Error(err))
	} else if !done {
		p.logger.Error("the owner has done the exec operation, but some other sources have not done the skip operation; the lock is still removed",
			zap.String("lock", id), zap.Bool("force remove", forceRemove), zap.String("owner", owner),
			zap.Strings("un-synced", unsynced), zap.Strings("synced", synced))
	}

	// 8. remove or clear the shard DDL lock and info.
	p.lk.RemoveLock(id)
	err2 := p.deleteInfosOps(lock)

	switch {
	case err != nil && err2 != nil:
		return terror.ErrMasterPartWorkerExecDDLFail.AnnotateDelegate(
			err, "fail to wait for non-owner sources %v to skip the shard DDL and delete shard DDL infos and operations, %s", unsynced, err2.Error())
	case err != nil:
		return terror.ErrMasterPartWorkerExecDDLFail.Delegate(err, "fail to wait for non-owner sources to skip the shard DDL")
	case err2 != nil:
		return terror.ErrMasterPartWorkerExecDDLFail.Delegate(err2, "fail to delete shard DDL infos and operations")
	}
	return nil
}
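
// Editor's illustration (not upstream code): `unlock-ddl-lock` ultimately reaches
// UnlockLock roughly as sketched below. The lock ID is supplied by the caller;
// keeping replaceOwner empty preserves the original owner, and forceRemove=false
// keeps the lock if the owner fails to execute the DDL. This helper is hypothetical.
func exampleManualUnlock(ctx context.Context, p *Pessimist, lockID string) error {
	if _, ok := p.Locks()[lockID]; !ok {
		return nil // nothing to do: the lock does not exist (or was already resolved).
	}
	// keep the original owner (replaceOwner == "") and do not force removal on failure.
	return p.UnlockLock(ctx, lockID, "", false)
}
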
// RemoveMetaData removes meta data for a specified task.
// NOTE: this function can only be used when the specified task is not running.
func (p *Pessimist) RemoveMetaData(task string) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.closed {
		return terror.ErrMasterPessimistNotStarted.Generate()
	}

	infos, ops, _, err := pessimism.GetInfosOperationsByTask(p.cli, task)
	if err != nil {
		return err
	}
	for _, info := range infos {
		p.lk.RemoveLockByInfo(info)
	}
	for _, op := range ops {
		p.lk.RemoveLock(op.ID)
	}

	// clear meta data in etcd
	_, err = pessimism.DeleteInfosOperationsByTask(p.cli, task)
	return err
}

// recoverLocks recovers shard DDL locks based on the shard DDL info and shard DDL lock operations.
func (p *Pessimist) recoverLocks(ifm map[string]map[string]pessimism.Info, opm map[string]map[string]pessimism.Operation) error {
	// construct locks based on the shard DDL info.
	for task, ifs := range ifm {
		sources := p.taskSources(task)
		// if no operation exists for the lock, we use the smallest source (in lexicographical order) as the owner of the lock.
		// if any operation exists for the lock, we use the source with `exec=true` as the owner of the lock (the logic is below).
		for _, info := range pessimismInfoMapToSlice(ifs) {
			_, _, _, err := p.lk.TrySync(info, sources)
			if err != nil {
				return err
			}
		}
	}

	// update locks based on the lock operations.
	for _, ops := range opm {
		for source, op := range ops {
			lock := p.lk.FindLock(op.ID)
			if lock == nil {
				p.logger.Warn("no shard DDL lock exists for the operation", zap.Stringer("operation", op))
				continue
			}

			// if any operation exists, the lock must have been synced.
			lock.ForceSynced()

			if op.Done {
				lock.MarkDone(source)
			}
			if op.Exec {
				// restore the role of `owner` based on the `exec` operation.
				// This is needed because `TrySync` can only set `owner` for the first call of the lock.
				p.logger.Info("restore the role of owner for the shard DDL lock", zap.String("lock", op.ID), zap.String("from", lock.Owner), zap.String("to", op.Source))
				lock.Owner = op.Source
			}
		}
	}

	// try to handle locks.
	for _, lock := range p.lk.Locks() {
		synced, remain := lock.IsSynced()
		if !synced {
			p.logger.Info("restored an un-synced shard DDL lock", zap.String("lock", lock.ID), zap.Int("remain", remain))
			continue
		}
		err := p.handleLock(lock.ID, "")
		if err != nil {
			return err
		}
	}

	return nil
}
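
// Editor's illustration (not upstream code): the "smallest source becomes the owner"
// rule mentioned in recoverLocks boils down to sorting source names lexicographically
// and taking the first one; pessimismInfoMapToSlice is assumed to feed TrySync in that
// order. The helper below is hypothetical and only restates the idea.
func exampleDefaultOwner(sources []string) string {
	if len(sources) == 0 {
		return ""
	}
	sorted := make([]string, len(sources))
	copy(sorted, sources)
	sort.Strings(sorted)
	return sorted[0] // e.g. "mysql-replica-01" wins over "mysql-replica-02".
}
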
// handleInfoPut handles the shard DDL info PUTed into etcd.
func (p *Pessimist) handleInfoPut(ctx context.Context, infoCh <-chan pessimism.Info) {
	for {
		select {
		case <-ctx.Done():
			return
		case info, ok := <-infoCh:
			if !ok {
				return
			}
			p.logger.Info("receive a shard DDL info", zap.Stringer("info", info))

			p.infoOpMu.Lock()
			lockID, synced, remain, err := p.lk.TrySync(info, p.taskSources(info.Task))
			if err != nil {
				p.logger.Error("fail to try sync shard DDL lock", zap.Stringer("info", info), log.ShortError(err))
				// currently, only a DDL mismatch will cause an error.
				metrics.ReportDDLError(info.Task, metrics.InfoErrSyncLock)
				p.infoOpMu.Unlock()
				continue
			} else if !synced {
				p.logger.Info("the shard DDL lock has not synced", zap.String("lock", lockID), zap.Int("remain", remain))
				p.infoOpMu.Unlock()
				continue
			}
			p.logger.Info("the shard DDL lock has synced", zap.String("lock", lockID))

			err = p.handleLock(lockID, info.Source)
			if err != nil {
				p.logger.Error("fail to handle the shard DDL lock", zap.String("lock", lockID), log.ShortError(err))
				metrics.ReportDDLError(info.Task, metrics.InfoErrHandleLock)
			}
			p.infoOpMu.Unlock()
		}
	}
}

// handleOperationPut handles the shard DDL lock operations PUTed into etcd.
func (p *Pessimist) handleOperationPut(ctx context.Context, opCh <-chan pessimism.Operation) {
	for {
		select {
		case <-ctx.Done():
			return
		case op, ok := <-opCh:
			if !ok {
				return
			}
			p.logger.Info("receive a shard DDL lock operation", zap.Stringer("operation", op))
			if !op.Done {
				p.logger.Info("the shard DDL lock operation has not been done", zap.Stringer("operation", op))
				continue
			}

			p.infoOpMu.Lock()
			lock := p.lk.FindLock(op.ID)
			if lock == nil {
				p.logger.Warn("no lock for the shard DDL lock operation exists", zap.Stringer("operation", op))
				p.infoOpMu.Unlock()
				continue
			} else if synced, _ := lock.IsSynced(); !synced {
				// this should not happen in the normal case.
				p.logger.Warn("the lock for the shard DDL lock operation has not synced", zap.Stringer("operation", op))
				metrics.ReportDDLError(op.Task, metrics.OpErrLockUnSynced)
				p.infoOpMu.Unlock()
				continue
			}

			// update the `done` status of the lock and check whether it is resolved.
			lock.MarkDone(op.Source)
			if lock.IsResolved() {
				p.logger.Info("the lock for the shard DDL lock operation has been resolved", zap.Stringer("operation", op))
				// remove all operations for this shard DDL lock.
				err := p.removeLock(lock)
				if err != nil {
					p.logger.Error("fail to delete the shard DDL lock operations", zap.String("lock", lock.ID), log.ShortError(err))
					metrics.ReportDDLError(op.Task, metrics.OpErrRemoveLock)
				}
				p.logger.Info("the lock info for the shard DDL lock operation has been cleared", zap.Stringer("operation", op))
				p.infoOpMu.Unlock()
				continue
			}

			// one of the non-owner dm-worker instances has done the operation,
			// but we still need to wait for more `done` from other non-owner dm-worker instances.
			if op.Source != lock.Owner {
				p.logger.Info("the shard DDL lock operation of a non-owner has been done", zap.Stringer("operation", op), zap.String("owner", lock.Owner))
				p.infoOpMu.Unlock()
				continue
			}

			// the owner has done the operation, so put `skip` operations for the non-owner dm-worker instances.
			// no need to `skipDone`: none of them should be done just after the owner has done its operation.
			err := p.putOpsForNonOwner(lock, "", false)
			if err != nil {
				p.logger.Error("fail to put skip shard DDL lock operations for non-owner", zap.String("lock", lock.ID), log.ShortError(err))
				metrics.ReportDDLError(op.Task, metrics.OpErrPutNonOwnerOp)
			}
			p.infoOpMu.Unlock()
		}
	}
}

// handleLock handles a single shard DDL lock.
// if source is not empty, it means the function is triggered by an Info from that source;
// this often happens when the source PUTs its info again after an interruption.
func (p *Pessimist) handleLock(lockID, source string) error {
	lock := p.lk.FindLock(lockID)
	if lock == nil {
		return nil
	}
	if synced, _ := lock.IsSynced(); !synced {
		return nil // do not handle un-synced locks now.
	}

	// check whether the lock has been resolved.
	if lock.IsResolved() {
		// remove all operations for this shard DDL lock.
		// this is to handle the case where dm-master exited before deleting the operations for them.
		err := p.removeLock(lock)
		if err != nil {
			return err
		}
		return nil
	}

	// check whether the owner has done.
	if lock.IsDone(lock.Owner) {
		// try to put the skip operation for non-owner dm-worker instances,
		// this is to handle the case where dm-master exited before putting the operations for them.
		// use `skipDone` to avoid overwriting any existing operations.
		err := p.putOpsForNonOwner(lock, source, true)
		if err != nil {
			return err
		}
		return nil
	}

	// put `exec=true` for the owner and skip it if it already exists.
	return p.putOpForOwner(lock, lock.Owner, true)
}

// putOpForOwner PUTs the shard DDL lock operation for the owner into etcd.
func (p *Pessimist) putOpForOwner(lock *pessimism.Lock, owner string, skipDone bool) error {
	op := pessimism.NewOperation(lock.ID, lock.Task, owner, lock.DDLs, true, false)
	rev, succ, err := pessimism.PutOperations(p.cli, skipDone, op)
	if err != nil {
		return err
	}
	p.logger.Info("put exec shard DDL lock operation for the owner", zap.String("lock", lock.ID), zap.String("owner", lock.Owner), zap.Bool("already done", !succ), zap.Int64("revision", rev))
	return nil
}

// putOpsForNonOwner PUTs shard DDL lock operations for non-owner dm-worker instances into etcd.
func (p *Pessimist) putOpsForNonOwner(lock *pessimism.Lock, onlySource string, skipDone bool) error {
	var sources []string
	if onlySource != "" {
		sources = append(sources, onlySource)
	} else {
		for source := range lock.Ready() {
			if source != lock.Owner {
				sources = append(sources, source)
			}
		}
	}

	ops := make([]pessimism.Operation, 0, len(sources))
	for _, source := range sources {
		ops = append(ops, pessimism.NewOperation(lock.ID, lock.Task, source, lock.DDLs, false, false))
	}

	rev, succ, err := pessimism.PutOperations(p.cli, skipDone, ops...)
	if err != nil {
		return err
	}
	p.logger.Info("put skip shard DDL lock operations for non-owner", zap.String("lock", lock.ID), zap.Strings("non-owner", sources), zap.Bool("already done", !succ), zap.Int64("revision", rev))
	return nil
}
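
// Editor's illustration (not upstream code): the two kinds of operations above differ
// only in the `exec` flag passed to pessimism.NewOperation — true for the owner (it
// executes the DDLs downstream) and false for every other source (they skip the DDLs).
// The helper below is hypothetical and only restates the call sites above.
func exampleExecAndSkipOps(lock *pessimism.Lock, owner, other string) (execOp, skipOp pessimism.Operation) {
	execOp = pessimism.NewOperation(lock.ID, lock.Task, owner, lock.DDLs, true, false)  // exec=true, done=false
	skipOp = pessimism.NewOperation(lock.ID, lock.Task, other, lock.DDLs, false, false) // exec=false, done=false
	return execOp, skipOp
}
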
// removeLock removes the lock in memory and its information in etcd.
func (p *Pessimist) removeLock(lock *pessimism.Lock) error {
	// remove all operations for this shard DDL lock.
	if err := p.deleteOps(lock); err != nil {
		return err
	}

	failpoint.Inject("SleepWhenRemoveLock", func(val failpoint.Value) {
		t := val.(int)
		log.L().Info("wait for new DDL info to be put into etcd in pessimistic mode",
			zap.String("failpoint", "SleepWhenRemoveLock"),
			zap.Int("max wait second", t))

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		timer := time.NewTimer(time.Duration(t) * time.Second)
		defer timer.Stop()
	OUTER:
		for {
			select {
			case <-timer.C:
				log.L().Info("failed to wait for new DDL info", zap.Int("wait second", t))
				break OUTER
			case <-ticker.C:
				// manually check etcd
				infos, _, err := pessimism.GetAllInfo(p.cli)
				if err == nil {
					if _, ok := infos[lock.Task]; ok {
						log.L().Info("found new DDL info")
						break OUTER
					}
				}
			}
		}
	})
	p.lk.RemoveLock(lock.ID)
	return nil
}

// deleteOps DELETEs shard DDL lock operations relative to the lock.
func (p *Pessimist) deleteOps(lock *pessimism.Lock) error {
	ready := lock.Ready()
	ops := make([]pessimism.Operation, 0, len(ready))
	for source := range ready {
		// When deleting operations, we do not verify the value of the operation now,
		// so simply set `exec=false` and `done=true`.
		ops = append(ops, pessimism.NewOperation(lock.ID, lock.Task, source, lock.DDLs, false, true))
	}
	rev, err := pessimism.DeleteOperations(p.cli, ops...)
	if err != nil {
		return err
	}
	p.logger.Info("delete shard DDL lock operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
	return err
}

// deleteInfosOps DELETEs shard DDL lock infos and operations relative to the lock.
func (p *Pessimist) deleteInfosOps(lock *pessimism.Lock) error {
	ready := lock.Ready()
	infos := make([]pessimism.Info, 0, len(ready))
	for source := range lock.Ready() {
		// NOTE: we rely on the `schema` and `table` not being used in `DeleteInfosOperations`.
		infos = append(infos, pessimism.NewInfo(lock.Task, source, "", "", lock.DDLs))
	}
	ops := make([]pessimism.Operation, 0, len(ready))
	for source := range ready {
		// When deleting operations, we do not verify the value of the operation now,
		// so simply set `exec=false` and `done=true`.
		ops = append(ops, pessimism.NewOperation(lock.ID, lock.Task, source, lock.DDLs, false, true))
	}

	rev, err := pessimism.DeleteInfosOperations(p.cli, infos, ops)
	if err != nil {
		return err
	}
	p.logger.Info("delete shard DDL infos and operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
	return nil
}
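
// Editor's note (illustration, not upstream code): both wait helpers below poll the
// lock up to unlockWaitNum times at unlockWaitInterval, so with the package defaults
// an unlock waits at most about 10 * 1s = 10s for the owner phase and again for the
// non-owner phase before giving up. A hypothetical way to compute that bound:
func exampleMaxUnlockWait() time.Duration {
	return time.Duration(unlockWaitNum) * unlockWaitInterval // 10 * 1s = 10s with the default values.
}
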
// waitOwnerToBeDone waits for the owner of the lock to be done for the `exec` operation.
func (p *Pessimist) waitOwnerToBeDone(ctx context.Context, lock *pessimism.Lock, owner string) (bool, error) {
	if lock.IsDone(owner) {
		p.logger.Info("the owner of the lock has been done before",
			zap.String("owner", owner), zap.String("lock", lock.ID))
		return true, nil // done before.
	}

	// put the `exec` operation.
	err := p.putOpForOwner(lock, owner, true)
	if err != nil {
		return false, err
	}

	// wait for the owner to finish the operation.
	for retryNum := 1; retryNum <= unlockWaitNum; retryNum++ {
		select {
		case <-ctx.Done():
			return lock.IsDone(owner), ctx.Err()
		case <-time.After(unlockWaitInterval):
		}
		if lock.IsDone(owner) {
			break
		}
		p.logger.Info("retry to wait for the owner to finish the operation",
			zap.String("owner", owner), zap.String("lock", lock.ID), zap.Int("retry", retryNum))
	}

	return lock.IsDone(owner), nil
}

// waitNonOwnerToBeDone waits for the non-owner sources of the lock to be done for the `skip` operations.
func (p *Pessimist) waitNonOwnerToBeDone(ctx context.Context, lock *pessimism.Lock, owner string, sources []string) (bool, error) {
	// check whether some sources need to wait.
	if len(sources) == 0 {
		p.logger.Info("no non-owner sources need to wait for the operations", zap.String("lock", lock.ID))
		return true, nil
	}
	waitSources := make([]string, 0, len(sources)-1)
	for _, source := range sources {
		if source != owner {
			waitSources = append(waitSources, source)
		}
	}
	if len(waitSources) == 0 {
		p.logger.Info("no non-owner sources need to wait for the operations", zap.String("lock", lock.ID))
		return true, nil
	}

	// check whether they are already done before.
	allDone := func() bool {
		for _, source := range waitSources {
			if !lock.IsDone(source) {
				return false
			}
		}
		return true
	}
	if allDone() {
		p.logger.Info("non-owner sources of the lock have been done before",
			zap.String("lock", lock.ID), zap.Strings("sources", waitSources))
		return true, nil
	}

	// put `skip` operations.
	// NOTE: the `putOpsForNonOwner` automatically triggered in `handleOperationPut` by the done operation of the owner
	// may have already put `skip` operations for all non-owner sources, but in order to support `replace owner`,
	// we still put `skip` operations for waitSources one more time with `skipDone=true`.
	ops := make([]pessimism.Operation, 0, len(waitSources))
	for _, source := range waitSources {
		ops = append(ops, pessimism.NewOperation(lock.ID, lock.Task, source, lock.DDLs, false, false))
	}
	rev, succ, err := pessimism.PutOperations(p.cli, true, ops...)
	if err != nil {
		return false, err
	}
	p.logger.Info("put skip shard DDL lock operations for non-owner", zap.String("lock", lock.ID), zap.Strings("non-owner", waitSources), zap.Bool("already done", !succ), zap.Int64("revision", rev))

	// wait for the sources to finish the operations.
	for retryNum := 1; retryNum <= unlockWaitNum; retryNum++ {
		var ctxDone bool
		select {
		case <-ctx.Done():
			ctxDone = true
		case <-time.After(unlockWaitInterval):
		}
		if ctxDone || allDone() {
			break
		}
		p.logger.Info("retry to wait for non-owner sources to finish the operation",
			zap.String("lock", lock.ID), zap.Strings("sources", waitSources), zap.Int("retry", retryNum))
	}

	return allDone(), nil
}
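
// Editor's illustration (not upstream code): because unlockWaitInterval and
// unlockWaitNum are package-level variables, tests in this package could shorten the
// UnlockLock waiting behavior as in the hypothetical helper below; the chosen values
// are assumptions, not values used by the real test suite.
func exampleShortenUnlockWaitForTest() (restore func()) {
	oldInterval, oldNum := unlockWaitInterval, unlockWaitNum
	unlockWaitInterval = 10 * time.Millisecond
	unlockWaitNum = 3
	return func() {
		unlockWaitInterval = oldInterval
		unlockWaitNum = oldNum
	}
}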