github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/shardddl/pessimist_test.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package shardddl 15 16 import ( 17 "context" 18 "fmt" 19 "sync" 20 "testing" 21 "time" 22 23 "github.com/pingcap/errors" 24 "github.com/pingcap/tiflow/dm/common" 25 "github.com/pingcap/tiflow/dm/config" 26 "github.com/pingcap/tiflow/dm/pb" 27 "github.com/pingcap/tiflow/dm/pkg/log" 28 "github.com/pingcap/tiflow/dm/pkg/shardddl/pessimism" 29 "github.com/pingcap/tiflow/dm/pkg/terror" 30 "github.com/stretchr/testify/require" 31 "github.com/stretchr/testify/suite" 32 v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" 33 clientv3 "go.etcd.io/etcd/client/v3" 34 "go.etcd.io/etcd/tests/v3/integration" 35 ) 36 37 var etcdErrCompacted = v3rpc.ErrCompacted 38 39 const ( 40 noRestart = iota // do nothing in rebuildPessimist, just keep testing 41 restartOnly // restart without building new instance. mock leader role transfer 42 restartNewInstance // restart with build a new instance. mock progress restore from failure 43 ) 44 45 func TestPessimistSuite(t *testing.T) { 46 suite.Run(t, new(testPessimistSuite)) 47 } 48 49 type testPessimistSuite struct { 50 suite.Suite 51 mockCluster *integration.ClusterV3 52 etcdTestCli *clientv3.Client 53 } 54 55 func (t *testPessimistSuite) SetupSuite() { 56 require.NoError(t.T(), log.InitLogger(&log.Config{})) 57 58 integration.BeforeTestExternal(t.T()) 59 t.mockCluster = integration.NewClusterV3(t.T(), &integration.ClusterConfig{Size: 1}) 60 t.etcdTestCli = t.mockCluster.RandClient() 61 } 62 63 func (t *testPessimistSuite) TearDownSuite() { 64 t.mockCluster.Terminate(t.T()) 65 } 66 67 func (t *testPessimistSuite) TearDownTest() { 68 t.clearTestInfoOperation() 69 } 70 71 // clear keys in etcd test cluster. 72 func (t *testPessimistSuite) clearTestInfoOperation() { 73 t.T().Helper() 74 75 clearInfo := clientv3.OpDelete(common.ShardDDLPessimismInfoKeyAdapter.Path(), clientv3.WithPrefix()) 76 clearOp := clientv3.OpDelete(common.ShardDDLPessimismOperationKeyAdapter.Path(), clientv3.WithPrefix()) 77 _, err := t.etcdTestCli.Txn(context.Background()).Then(clearInfo, clearOp).Commit() 78 require.NoError(t.T(), err) 79 } 80 81 func (t *testPessimistSuite) TestPessimist() { 82 t.testPessimistProgress(noRestart) 83 t.testPessimistProgress(restartOnly) 84 t.testPessimistProgress(restartNewInstance) 85 } 86 87 func (t *testPessimistSuite) testPessimistProgress(restart int) { 88 defer t.clearTestInfoOperation() 89 90 var ( 91 watchTimeout = 3 * time.Second 92 task1 = "task-pessimist-1" 93 task2 = "task-pessimist-2" 94 source1 = "mysql-replica-1" 95 source2 = "mysql-replica-2" 96 source3 = "mysql-replica-3" 97 schema, table = "foo", "bar" 98 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 99 ID1 = fmt.Sprintf("%s-`%s`.`%s`", task1, schema, table) 100 ID2 = fmt.Sprintf("%s-`%s`.`%s`", task2, schema, table) 101 i11 = pessimism.NewInfo(task1, source1, schema, table, DDLs) 102 i12 = pessimism.NewInfo(task1, source2, schema, table, DDLs) 103 i21 = pessimism.NewInfo(task2, source1, schema, table, DDLs) 104 i22 = pessimism.NewInfo(task2, source2, schema, table, DDLs) 105 i23 = pessimism.NewInfo(task2, source3, schema, table, DDLs) 106 107 sources = func(task string) []string { 108 switch task { 109 case task1: 110 return []string{source1, source2} 111 case task2: 112 return []string{source1, source2, source3} 113 default: 114 t.T().Fatalf("unsupported task %s", task) 115 } 116 return []string{} 117 } 118 logger = log.L() 119 p = NewPessimist(&logger, sources) 120 121 rebuildPessimist = func(ctx context.Context) { 122 switch restart { 123 case restartOnly: 124 p.Close() 125 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 126 case restartNewInstance: 127 p.Close() 128 p = NewPessimist(&logger, sources) 129 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 130 } 131 } 132 ) 133 134 ctx, cancel := context.WithCancel(context.Background()) 135 defer cancel() 136 137 // CASE 1: start without any previous kv and no etcd operation. 138 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 139 require.Len(t.T(), p.Locks(), 0) 140 p.Close() 141 p.Close() // close multiple times. 142 143 // CASE 2: start again without any previous kv. 144 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 145 require.Len(t.T(), p.Locks(), 0) 146 147 // PUT i11, will create a lock but not synced. 148 _, err := pessimism.PutInfo(t.etcdTestCli, i11) 149 require.NoError(t.T(), err) 150 require.Eventually(t.T(), func() bool { 151 return len(p.Locks()) == 1 152 }, 3*time.Second, 100*time.Millisecond) 153 require.Contains(t.T(), p.Locks(), ID1) 154 synced, remain := p.Locks()[ID1].IsSynced() 155 require.False(t.T(), synced) 156 require.Equal(t.T(), 1, remain) 157 // PUT i12, the lock will be synced, then an operation PUT for the owner will be triggered. 158 rev1, err := pessimism.PutInfo(t.etcdTestCli, i12) 159 require.NoError(t.T(), err) 160 require.Eventually(t.T(), func() bool { 161 synced, _ = p.Locks()[ID1].IsSynced() 162 return synced 163 }, 3*time.Second, 100*time.Millisecond) 164 165 // wait exec operation for the owner become available. 166 opCh := make(chan pessimism.Operation, 10) 167 errCh := make(chan error, 10) 168 ctx2, cancel2 := context.WithTimeout(ctx, watchTimeout) 169 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task1, source1, rev1+1, opCh, errCh) 170 cancel2() 171 close(opCh) 172 close(errCh) 173 require.Equal(t.T(), 0, len(errCh)) 174 require.Equal(t.T(), 1, len(opCh)) 175 op11 := <-opCh 176 require.True(t.T(), op11.Exec) 177 require.False(t.T(), op11.Done) 178 179 // mark exec operation for the owner as `done` (and delete the info). 180 op11c := op11 181 op11c.Done = true 182 done, rev2, err := pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op11c, i11) 183 require.NoError(t.T(), err) 184 require.True(t.T(), done) 185 require.Eventually(t.T(), func() bool { 186 return p.Locks()[ID1].IsDone(source1) 187 }, 3*time.Second, 100*time.Millisecond) 188 189 // wait skip operation for the non-owner become available. 190 opCh = make(chan pessimism.Operation, 10) 191 errCh = make(chan error, 10) 192 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 193 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task1, source2, rev2+1, opCh, errCh) 194 cancel2() 195 close(opCh) 196 close(errCh) 197 require.Equal(t.T(), 0, len(errCh)) 198 require.Equal(t.T(), 1, len(opCh)) 199 op12 := <-opCh 200 require.False(t.T(), op12.Exec) 201 require.False(t.T(), op12.Done) 202 203 // mark skip operation for the non-owner as `done` (and delete the info). 204 // the lock should become resolved and deleted. 205 op12c := op12 206 op12c.Done = true 207 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op12c, i12) 208 require.NoError(t.T(), err) 209 require.True(t.T(), done) 210 require.Eventually(t.T(), func() bool { 211 _, ok := p.Locks()[ID1] 212 return !ok 213 }, 50*100*time.Millisecond, 100*time.Millisecond) 214 require.Len(t.T(), p.Locks(), 0) 215 require.Len(t.T(), p.ShowLocks("", nil), 0) 216 217 // PUT i21, i22, this will create a lock. 218 _, err = pessimism.PutInfo(t.etcdTestCli, i21) 219 require.NoError(t.T(), err) 220 _, err = pessimism.PutInfo(t.etcdTestCli, i22) 221 require.NoError(t.T(), err) 222 require.Eventually(t.T(), func() bool { 223 lock := p.Locks()[ID2] 224 if lock == nil { 225 return false 226 } 227 _, remain = lock.IsSynced() 228 return remain == 1 229 }, 3*time.Second, 100*time.Millisecond) 230 231 // CASE 3: start again with some previous shard DDL info and the lock is un-synced. 232 rebuildPessimist(ctx) 233 require.Len(t.T(), p.Locks(), 1) 234 require.Contains(t.T(), p.Locks(), ID2) 235 synced, remain = p.Locks()[ID2].IsSynced() 236 require.False(t.T(), synced) 237 require.Equal(t.T(), 1, remain) 238 // check ShowLocks. 239 expectedLock := []*pb.DDLLock{ 240 { 241 ID: ID2, 242 Task: i21.Task, 243 Mode: config.ShardPessimistic, 244 Owner: i21.Source, 245 DDLs: i21.DDLs, 246 Synced: []string{i21.Source, i22.Source}, 247 Unsynced: []string{i23.Source}, 248 }, 249 } 250 require.Equal(t.T(), expectedLock, p.ShowLocks("", []string{})) 251 require.Equal(t.T(), expectedLock, p.ShowLocks(i21.Task, []string{})) 252 require.Equal(t.T(), expectedLock, p.ShowLocks("", []string{i21.Source})) 253 require.Equal(t.T(), expectedLock, p.ShowLocks("", []string{i23.Source})) 254 require.Equal(t.T(), expectedLock, p.ShowLocks("", []string{i22.Source, i23.Source})) 255 require.Equal(t.T(), expectedLock, p.ShowLocks(i21.Task, []string{i22.Source, i23.Source})) 256 require.Len(t.T(), p.ShowLocks("not-exist", []string{}), 0) 257 require.Len(t.T(), p.ShowLocks("", []string{"not-exist"}), 0) 258 259 // PUT i23, then the lock will become synced. 260 rev3, err := pessimism.PutInfo(t.etcdTestCli, i23) 261 require.NoError(t.T(), err) 262 require.Eventually(t.T(), func() bool { 263 synced, _ = p.Locks()[ID2].IsSynced() 264 return synced 265 }, 3*time.Second, 100*time.Millisecond) 266 267 // wait exec operation for the owner become available. 268 opCh = make(chan pessimism.Operation, 10) 269 errCh = make(chan error, 10) 270 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 271 // both source1 and source2 have shard DDL info exist, and neither of them have operation exist. 272 // we must ensure source1 always become the owner of the lock. 273 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task2, source1, rev3+1, opCh, errCh) 274 cancel2() 275 close(opCh) 276 close(errCh) 277 require.Len(t.T(), errCh, 0) 278 require.Len(t.T(), opCh, 1) 279 op21 := <-opCh 280 require.True(t.T(), op21.Exec) 281 require.False(t.T(), op21.Done) 282 283 // CASE 4: start again with some previous shard DDL info and non-`done` operation. 284 rebuildPessimist(ctx) 285 require.Len(t.T(), p.Locks(), 1) 286 require.Contains(t.T(), p.Locks(), ID2) 287 synced, _ = p.Locks()[ID2].IsSynced() 288 require.True(t.T(), synced) 289 require.False(t.T(), p.Locks()[ID2].IsDone(source1)) 290 291 // mark exec operation for the owner as `done` (and delete the info). 292 op21c := op21 293 op21c.Done = true 294 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op21c, i21) 295 require.NoError(t.T(), err) 296 require.True(t.T(), done) 297 require.Eventually(t.T(), func() bool { 298 return p.Locks()[ID2].IsDone(source1) 299 }, 3*time.Second, 100*time.Millisecond) 300 301 // CASE 5: start again with some previous shard DDL info and `done` operation for the owner. 302 rebuildPessimist(ctx) 303 require.Len(t.T(), p.Locks(), 1) 304 require.Contains(t.T(), p.Locks(), ID2) 305 synced, _ = p.Locks()[ID2].IsSynced() 306 require.True(t.T(), synced) 307 require.True(t.T(), p.Locks()[ID2].IsDone(source1)) 308 require.False(t.T(), p.Locks()[ID2].IsDone(source2)) 309 310 // mark exec operation for one non-owner as `done` (and delete the info). 311 op22c := pessimism.NewOperation(ID2, task2, source2, DDLs, false, true) 312 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op22c, i22) 313 require.NoError(t.T(), err) 314 require.True(t.T(), done) 315 require.Eventually(t.T(), func() bool { 316 return p.Locks()[ID2].IsDone(source2) 317 }, 3*time.Second, 100*time.Millisecond) 318 319 // CASE 6: start again with some previous shard DDL info and `done` operation for the owner and non-owner. 320 rebuildPessimist(ctx) 321 require.Len(t.T(), p.Locks(), 1) 322 require.Contains(t.T(), p.Locks(), ID2) 323 synced, _ = p.Locks()[ID2].IsSynced() 324 require.True(t.T(), synced) 325 require.True(t.T(), p.Locks()[ID2].IsDone(source1)) 326 require.True(t.T(), p.Locks()[ID2].IsDone(source2)) 327 require.False(t.T(), p.Locks()[ID2].IsDone(source3)) 328 329 // mark skip operation for the non-owner as `done` (and delete the info). 330 // the lock should become resolved and deleted. 331 op23c := pessimism.NewOperation(ID2, task2, source3, DDLs, false, true) 332 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op23c, i23) 333 require.NoError(t.T(), err) 334 require.True(t.T(), done) 335 require.Eventually(t.T(), func() bool { 336 _, ok := p.Locks()[ID2] 337 return !ok 338 }, 3*time.Second, 100*time.Millisecond) 339 require.Len(t.T(), p.Locks(), 0) 340 341 // CASE 7: start again after all shard DDL locks have been resolved. 342 rebuildPessimist(ctx) 343 require.Len(t.T(), p.Locks(), 0) 344 p.Close() // close the Pessimist. 345 } 346 347 func (t *testPessimistSuite) TestSourceReEntrant() { 348 // sources (owner or non-owner) may be interrupted and re-run the sequence again. 349 var ( 350 watchTimeout = 3 * time.Second 351 task = "task-source-re-entrant" 352 source1 = "mysql-replica-1" 353 source2 = "mysql-replica-2" 354 source3 = "mysql-replica-3" 355 schema, table = "foo", "bar" 356 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 357 ID = fmt.Sprintf("%s-`%s`.`%s`", task, schema, table) 358 i11 = pessimism.NewInfo(task, source1, schema, table, DDLs) 359 i12 = pessimism.NewInfo(task, source2, schema, table, DDLs) 360 i13 = pessimism.NewInfo(task, source3, schema, table, DDLs) 361 362 sources = func(task string) []string { 363 switch task { 364 case task: 365 return []string{source1, source2, source3} 366 default: 367 t.T().Fatalf("unsupported task %s", task) 368 } 369 return []string{} 370 } 371 logger = log.L() 372 p = NewPessimist(&logger, sources) 373 ) 374 375 ctx, cancel := context.WithCancel(context.Background()) 376 defer cancel() 377 378 // 0. start the pessimist. 379 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 380 require.Len(t.T(), p.Locks(), 0) 381 defer p.Close() 382 383 // 1. PUT i11 and i12, will create a lock but not synced. 384 _, err := pessimism.PutInfo(t.etcdTestCli, i11) 385 require.NoError(t.T(), err) 386 _, err = pessimism.PutInfo(t.etcdTestCli, i12) 387 require.NoError(t.T(), err) 388 require.Eventually(t.T(), func() bool { 389 lock := p.Locks()[ID] 390 if lock == nil { 391 return false 392 } 393 _, remain := lock.IsSynced() 394 return remain == 1 395 }, 3*time.Second, 100*time.Millisecond) 396 397 // 2. re-PUT i11, to simulate the re-entrant of the owner before the lock become synced. 398 rev1, err := pessimism.PutInfo(t.etcdTestCli, i11) 399 require.NoError(t.T(), err) 400 401 // 3. re-PUT i12, to simulate the re-entrant of the non-owner before the lock become synced. 402 rev2, err := pessimism.PutInfo(t.etcdTestCli, i12) 403 require.NoError(t.T(), err) 404 405 // 4. wait exec operation for the owner become available. 406 var wg sync.WaitGroup 407 wg.Add(1) 408 go func() { 409 defer wg.Done() 410 opCh := make(chan pessimism.Operation, 10) 411 errCh := make(chan error, 10) 412 ctx2, cancel2 := context.WithTimeout(ctx, watchTimeout) 413 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source1, rev1+1, opCh, errCh) 414 cancel2() 415 close(opCh) 416 close(errCh) 417 require.Len(t.T(), errCh, 0) 418 require.Len(t.T(), opCh, 1) 419 op := <-opCh 420 require.True(t.T(), op.Exec) 421 require.False(t.T(), op.Done) 422 }() 423 424 // 5. put i13, the lock will become synced, then an operation PUT for the owner will be triggered. 425 _, err = pessimism.PutInfo(t.etcdTestCli, i13) 426 require.NoError(t.T(), err) 427 wg.Wait() 428 429 // 6. re-PUT i11, to simulate the re-entrant of the owner after the lock become synced. 430 rev1, err = pessimism.PutInfo(t.etcdTestCli, i11) 431 require.NoError(t.T(), err) 432 433 // 8. wait exec operation for the owner become available again (with new revision). 434 opCh := make(chan pessimism.Operation, 10) 435 errCh := make(chan error, 10) 436 ctx2, cancel2 := context.WithTimeout(ctx, watchTimeout) 437 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source1, rev1+1, opCh, errCh) 438 cancel2() 439 close(opCh) 440 close(errCh) 441 require.Len(t.T(), errCh, 0) 442 require.Len(t.T(), opCh, 1) 443 op11 := <-opCh 444 require.True(t.T(), op11.Exec) 445 require.False(t.T(), op11.Done) 446 447 // 9. wait exec operation for the non-owner become available. 448 wg.Add(1) 449 go func() { 450 defer wg.Done() 451 opCh = make(chan pessimism.Operation, 10) 452 errCh = make(chan error, 10) 453 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 454 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source2, rev2+1, opCh, errCh) 455 cancel2() 456 close(opCh) 457 close(errCh) 458 require.Len(t.T(), errCh, 0) 459 require.Len(t.T(), opCh, 1) 460 op := <-opCh 461 require.False(t.T(), op.Exec) 462 require.False(t.T(), op.Done) 463 }() 464 465 // 10. mark exec operation for the owner as `done` (and delete the info). 466 op11c := op11 467 op11c.Done = true 468 done, _, err := pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op11c, i11) 469 require.NoError(t.T(), err) 470 require.True(t.T(), done) 471 require.Eventually(t.T(), func() bool { 472 return p.Locks()[ID].IsDone(source1) 473 }, 3*time.Second, 100*time.Millisecond) 474 wg.Wait() 475 476 // 11. re-PUT i12, to simulate the re-entrant of the non-owner after the lock become synced. 477 rev2, err = pessimism.PutInfo(t.etcdTestCli, i12) 478 require.NoError(t.T(), err) 479 480 // 12. wait skip operation for the non-owner become available again (with new revision, without existing done). 481 opCh = make(chan pessimism.Operation, 10) 482 errCh = make(chan error, 10) 483 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 484 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source2, rev2+1, opCh, errCh) 485 cancel2() 486 close(opCh) 487 close(errCh) 488 require.Len(t.T(), errCh, 0) 489 require.Len(t.T(), opCh, 1) 490 op12 := <-opCh 491 require.False(t.T(), op12.Exec) 492 require.False(t.T(), op12.Done) 493 494 // 13. mark skip operation for the non-owner as `done` (and delete the info). 495 op12c := op12 496 op12c.Done = true 497 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op12c, i12) 498 require.NoError(t.T(), err) 499 require.True(t.T(), done) 500 require.Eventually(t.T(), func() bool { 501 return p.Locks()[ID].IsDone(source2) 502 }, 3*time.Second, 100*time.Millisecond) 503 504 // 14. re-PUT i13, to simulate the re-entrant of the owner after the lock become synced. 505 rev3, err := pessimism.PutInfo(t.etcdTestCli, i13) 506 require.NoError(t.T(), err) 507 508 // 15. wait skip operation for the non-owner become available again (with new revision, with existing done). 509 opCh = make(chan pessimism.Operation, 10) 510 errCh = make(chan error, 10) 511 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 512 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source3, rev3+1, opCh, errCh) 513 cancel2() 514 close(opCh) 515 close(errCh) 516 require.Len(t.T(), errCh, 0) 517 require.Len(t.T(), opCh, 1) 518 op13 := <-opCh 519 require.False(t.T(), op13.Exec) 520 require.False(t.T(), op13.Done) 521 522 // 16. mark skip operation for the non-owner as `done` (and delete the info). 523 // the lock should become resolved now. 524 op13c := op13 525 op13c.Done = true 526 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op13c, i13) 527 require.NoError(t.T(), err) 528 require.True(t.T(), done) 529 require.Eventually(t.T(), func() bool { 530 _, ok := p.Locks()[ID] 531 return !ok 532 }, 3*time.Second, 100*time.Millisecond) 533 t.noLockExist(p) 534 } 535 536 func (t *testPessimistSuite) TestUnlockSourceMissBeforeSynced() { 537 // some sources may be deleted (miss) before the lock become synced. 538 539 oriUnlockWaitOwnerInterval := unlockWaitInterval 540 unlockWaitInterval = 100 * time.Millisecond 541 defer func() { 542 unlockWaitInterval = oriUnlockWaitOwnerInterval 543 }() 544 545 var ( 546 watchTimeout = 3 * time.Second 547 task = "task-unlock-source-lack-before-synced" 548 source1 = "mysql-replica-1" 549 source2 = "mysql-replica-2" 550 source3 = "mysql-replica-3" 551 schema, table = "foo", "bar" 552 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 553 ID = fmt.Sprintf("%s-`%s`.`%s`", task, schema, table) 554 i11 = pessimism.NewInfo(task, source1, schema, table, DDLs) 555 i12 = pessimism.NewInfo(task, source2, schema, table, DDLs) 556 557 sources = func(task string) []string { 558 switch task { 559 case task: 560 return []string{source1, source2, source3} 561 default: 562 t.T().Fatalf("unsupported task %s", task) 563 } 564 return []string{} 565 } 566 logger = log.L() 567 p = NewPessimist(&logger, sources) 568 ) 569 570 ctx, cancel := context.WithCancel(context.Background()) 571 defer cancel() 572 573 // 0. start the pessimist. 574 require.True(t.T(), terror.ErrMasterPessimistNotStarted.Equal(p.UnlockLock(ctx, ID, "", false))) 575 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 576 require.Len(t.T(), p.Locks(), 0) 577 defer p.Close() 578 579 // no lock need to be unlock now. 580 require.True(t.T(), terror.ErrMasterLockNotFound.Equal(p.UnlockLock(ctx, ID, "", false))) 581 582 // 1. PUT i11 & i12, will create a lock but now synced. 583 // not PUT info for source3 to simulate the deletion of it. 584 _, err := pessimism.PutInfo(t.etcdTestCli, i11) 585 require.NoError(t.T(), err) 586 rev1, err := pessimism.PutInfo(t.etcdTestCli, i12) 587 require.NoError(t.T(), err) 588 require.Eventually(t.T(), func() bool { 589 if len(p.Locks()) != 1 { 590 return false 591 } 592 _, remain := p.Locks()[ID].IsSynced() 593 return remain == 1 594 }, 3*time.Second, 100*time.Millisecond) 595 require.Contains(t.T(), p.Locks(), ID) 596 synced, _ := p.Locks()[ID].IsSynced() 597 require.False(t.T(), synced) 598 ready := p.Locks()[ID].Ready() 599 require.Len(t.T(), ready, 3) 600 require.True(t.T(), ready[source1]) 601 require.True(t.T(), ready[source2]) 602 require.False(t.T(), ready[source3]) 603 604 // 2. try to unlock the lock manually, but the owner has not done the operation. 605 // this will put `exec` operation for the done. 606 require.True(t.T(), terror.ErrMasterOwnerExecDDL.Equal(p.UnlockLock(ctx, ID, "", false))) 607 608 // 3. try to unlock the lock manually, and the owner done the operation. 609 var wg sync.WaitGroup 610 wg.Add(2) 611 go func() { 612 defer wg.Done() 613 // put done for the owner. 614 t.putDoneForSource(ctx, task, source1, i11, true, rev1+1, watchTimeout) 615 }() 616 go func() { 617 defer wg.Done() 618 // put done for the synced `source2`, no need to put done for the un-synced `source3`. 619 t.putDoneForSource(ctx, task, source2, i12, false, rev1+1, watchTimeout) 620 }() 621 require.NoError(t.T(), p.UnlockLock(ctx, ID, "", false)) 622 wg.Wait() 623 624 // 4. the lock should be removed now. 625 t.noLockExist(p) 626 } 627 628 func (t *testPessimistSuite) TestUnlockSourceInterrupt() { 629 // operations may be done but not be deleted, and then interrupted. 630 631 oriUnlockWaitOwnerInterval := unlockWaitInterval 632 unlockWaitInterval = 100 * time.Millisecond 633 defer func() { 634 unlockWaitInterval = oriUnlockWaitOwnerInterval 635 }() 636 637 var ( 638 watchTimeout = 3 * time.Second 639 task = "task-unlock-source-interrupt" 640 source1 = "mysql-replica-1" 641 source2 = "mysql-replica-2" 642 schema, table = "foo", "bar" 643 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 644 ID = fmt.Sprintf("%s-`%s`.`%s`", task, schema, table) 645 i11 = pessimism.NewInfo(task, source1, schema, table, DDLs) 646 i12 = pessimism.NewInfo(task, source2, schema, table, DDLs) 647 648 sources = func(task string) []string { 649 switch task { 650 case task: 651 return []string{source1, source2} 652 default: 653 t.T().Fatalf("unsupported task %s", task) 654 } 655 return []string{} 656 } 657 logger = log.L() 658 p = NewPessimist(&logger, sources) 659 ) 660 661 ctx, cancel := context.WithCancel(context.Background()) 662 defer cancel() 663 664 // 0. start the pessimist. 665 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 666 require.Len(t.T(), p.Locks(), 0) 667 defer p.Close() 668 669 // CASE 1: owner interrupted. 670 // 1. PUT i11 & i12, will create a lock and synced. 671 rev1, err := pessimism.PutInfo(t.etcdTestCli, i11) 672 require.NoError(t.T(), err) 673 _, err = pessimism.PutInfo(t.etcdTestCli, i12) 674 require.NoError(t.T(), err) 675 require.Eventually(t.T(), func() bool { 676 if len(p.Locks()) != 1 { 677 return false 678 } 679 synced, remain := p.Locks()[ID].IsSynced() 680 return synced && remain == 0 681 }, 3*time.Second, 100*time.Millisecond) 682 require.Contains(t.T(), p.Locks(), ID) 683 ready := p.Locks()[ID].Ready() 684 require.Len(t.T(), ready, 2) 685 require.True(t.T(), ready[source1]) 686 require.True(t.T(), ready[source2]) 687 688 // 2. watch until get not-done operation for the owner. 689 opCh := make(chan pessimism.Operation, 10) 690 errCh := make(chan error, 10) 691 ctx2, cancel2 := context.WithTimeout(ctx, watchTimeout) 692 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, "", rev1+1, opCh, errCh) 693 cancel2() 694 close(opCh) 695 close(errCh) 696 require.Len(t.T(), errCh, 0) 697 require.Len(t.T(), opCh, 1) 698 op := <-opCh 699 require.Equal(t.T(), source1, op.Source) 700 require.True(t.T(), op.Exec) 701 require.False(t.T(), op.Done) 702 require.False(t.T(), p.Locks()[ID].IsResolved()) 703 704 // 3. try to unlock the lock, but no `done` marked for the owner. 705 require.True(t.T(), terror.ErrMasterOwnerExecDDL.Equal(p.UnlockLock(ctx, ID, "", false))) 706 require.False(t.T(), p.Locks()[ID].IsResolved()) 707 708 // 4. force to remove the lock even no `done` marked for the owner. 709 require.NoError(t.T(), p.UnlockLock(ctx, ID, "", true)) 710 t.noLockExist(p) 711 712 // CASE 2: non-owner interrupted. 713 // 1. PUT i11 & i12, will create a lock and synced. 714 rev1, err = pessimism.PutInfo(t.etcdTestCli, i11) 715 require.NoError(t.T(), err) 716 _, err = pessimism.PutInfo(t.etcdTestCli, i12) 717 require.NoError(t.T(), err) 718 require.Eventually(t.T(), func() bool { 719 if len(p.Locks()) != 1 { 720 return false 721 } 722 synced, remain := p.Locks()[ID].IsSynced() 723 return synced && remain == 0 724 }, 3*time.Second, 100*time.Millisecond) 725 require.Contains(t.T(), p.Locks(), ID) 726 ready = p.Locks()[ID].Ready() 727 require.Len(t.T(), ready, 2) 728 require.True(t.T(), ready[source1]) 729 require.True(t.T(), ready[source2]) 730 731 // 2. putDone for the owner. 732 t.putDoneForSource(ctx, task, source1, i11, true, rev1+1, watchTimeout) 733 require.Eventually(t.T(), func() bool { 734 return p.Locks()[ID].IsDone(source1) 735 }, 3*time.Second, 100*time.Millisecond) 736 require.False(t.T(), p.Locks()[ID].IsDone(source2)) 737 738 // 3. unlock the lock. 739 require.NoError(t.T(), p.UnlockLock(ctx, ID, "", false)) 740 t.noLockExist(p) 741 } 742 743 func (t *testPessimistSuite) TestUnlockSourceOwnerRemoved() { 744 // the owner may be deleted before the lock become synced. 745 746 oriUnlockWaitOwnerInterval := unlockWaitInterval 747 unlockWaitInterval = 100 * time.Millisecond 748 defer func() { 749 unlockWaitInterval = oriUnlockWaitOwnerInterval 750 }() 751 752 var ( 753 watchTimeout = 3 * time.Second 754 task = "task-unlock-source-owner-removed" 755 source1 = "mysql-replica-1" 756 source2 = "mysql-replica-2" 757 source3 = "mysql-replica-3" 758 schema, table = "foo", "bar" 759 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 760 ID = fmt.Sprintf("%s-`%s`.`%s`", task, schema, table) 761 i11 = pessimism.NewInfo(task, source1, schema, table, DDLs) 762 i12 = pessimism.NewInfo(task, source2, schema, table, DDLs) 763 764 sources = func(task string) []string { 765 switch task { 766 case task: 767 return []string{source1, source2, source3} 768 default: 769 t.T().Fatalf("unsupported task %s", task) 770 } 771 return []string{} 772 } 773 logger = log.L() 774 p = NewPessimist(&logger, sources) 775 ) 776 777 ctx, cancel := context.WithCancel(context.Background()) 778 defer cancel() 779 780 // 0. start the pessimist. 781 require.NoError(t.T(), p.Start(ctx, t.etcdTestCli)) 782 require.Len(t.T(), p.Locks(), 0) 783 defer p.Close() 784 785 // no lock need to be unlock now. 786 require.True(t.T(), terror.ErrMasterLockNotFound.Equal(p.UnlockLock(ctx, ID, "", false))) 787 788 // 1. PUT i11 & i12, will create a lock but now synced. 789 _, err := pessimism.PutInfo(t.etcdTestCli, i11) 790 require.NoError(t.T(), err) 791 rev1, err := pessimism.PutInfo(t.etcdTestCli, i12) 792 require.NoError(t.T(), err) 793 require.Eventually(t.T(), func() bool { 794 if len(p.Locks()) != 1 { 795 return false 796 } 797 _, remain := p.Locks()[ID].IsSynced() 798 return remain == 1 799 }, 3*time.Second, 100*time.Millisecond) 800 require.Contains(t.T(), p.Locks(), ID) 801 synced, _ := p.Locks()[ID].IsSynced() 802 require.False(t.T(), synced) 803 ready := p.Locks()[ID].Ready() 804 require.Len(t.T(), ready, 3) 805 require.True(t.T(), ready[source1]) 806 require.True(t.T(), ready[source2]) 807 require.False(t.T(), ready[source3]) 808 809 // 2. try to unlock the lock with an un-synced replace owner. 810 require.True(t.T(), terror.ErrMasterWorkerNotWaitLock.Equal(p.UnlockLock(ctx, ID, source3, false))) 811 812 // 3. try to unlock the lock with a synced replace owner, but the replace owner has not done the operation. 813 // this will put `exec` operation for the done. 814 require.True(t.T(), terror.ErrMasterOwnerExecDDL.Equal(p.UnlockLock(ctx, ID, source2, false))) 815 816 // 4. put done for the replace owner then can unlock the lock. 817 var wg sync.WaitGroup 818 wg.Add(1) 819 go func() { 820 defer wg.Done() 821 t.putDoneForSource(ctx, task, source2, i11, true, rev1+1, watchTimeout) 822 }() 823 require.NoError(t.T(), p.UnlockLock(ctx, ID, source2, false)) 824 wg.Wait() 825 826 // 4. the lock should be removed now. 827 t.noLockExist(p) 828 } 829 830 func (t *testPessimistSuite) TestMeetEtcdCompactError() { 831 var ( 832 watchTimeout = 3 * time.Second 833 task1 = "task-pessimist-1" 834 task2 = "task-pessimist-2" 835 source1 = "mysql-replica-1" 836 source2 = "mysql-replica-2" 837 source3 = "mysql-replica-3" 838 schema, table = "foo", "bar" 839 DDLs = []string{"ALTER TABLE bar ADD COLUMN c1 INT"} 840 ID1 = fmt.Sprintf("%s-`%s`.`%s`", task1, schema, table) 841 i11 = pessimism.NewInfo(task1, source1, schema, table, DDLs) 842 i12 = pessimism.NewInfo(task1, source2, schema, table, DDLs) 843 op = pessimism.NewOperation(ID1, task1, source1, DDLs, true, false) 844 revCompacted int64 845 846 infoCh chan pessimism.Info 847 opCh chan pessimism.Operation 848 errCh chan error 849 err error 850 851 sources = func(task string) []string { 852 switch task { 853 case task1: 854 return []string{source1, source2} 855 case task2: 856 return []string{source1, source2, source3} 857 default: 858 t.T().Fatalf("unsupported task %s", task) 859 } 860 return []string{} 861 } 862 logger = log.L() 863 p = NewPessimist(&logger, sources) 864 ) 865 866 ctx, cancel := context.WithCancel(context.Background()) 867 defer cancel() 868 p.cli = t.etcdTestCli 869 870 for i := 0; i <= 1; i++ { 871 // i == 0, watch info is compacted; i == 1, watch operation is compacted 872 // step 1: trigger an etcd compaction 873 if i == 0 { 874 revCompacted, err = pessimism.PutInfo(t.etcdTestCli, i11) 875 } else { 876 var putted bool 877 revCompacted, putted, err = pessimism.PutOperations(t.etcdTestCli, false, op) 878 require.True(t.T(), putted) 879 } 880 require.NoError(t.T(), err) 881 if i == 0 { 882 _, err = pessimism.DeleteInfosOperations(t.etcdTestCli, []pessimism.Info{i11}, []pessimism.Operation{}) 883 } else { 884 _, err = pessimism.DeleteOperations(t.etcdTestCli, op) 885 } 886 require.NoError(t.T(), err) 887 revThreshold, err := pessimism.PutInfo(t.etcdTestCli, i11) 888 require.NoError(t.T(), err) 889 _, err = t.etcdTestCli.Compact(ctx, revThreshold) 890 require.NoError(t.T(), err) 891 892 infoCh = make(chan pessimism.Info, 10) 893 errCh = make(chan error, 10) 894 ctx1, cancel1 := context.WithTimeout(ctx, time.Second) 895 if i == 0 { 896 pessimism.WatchInfoPut(ctx1, t.etcdTestCli, revCompacted, infoCh, errCh) 897 } else { 898 pessimism.WatchOperationPut(ctx1, t.etcdTestCli, "", "", revCompacted, opCh, errCh) 899 } 900 cancel1() 901 select { 902 case err = <-errCh: 903 require.Equal(t.T(), etcdErrCompacted, err) 904 case <-time.After(300 * time.Millisecond): 905 t.T().Fatal("fail to get etcd error compacted") 906 } 907 908 // step 2: start running, i11 and i12 should be handled successfully 909 ctx2, cancel2 := context.WithCancel(ctx) 910 var wg sync.WaitGroup 911 wg.Add(1) 912 go func() { 913 defer wg.Done() 914 rev1, rev2 := revCompacted, revThreshold 915 if i == 1 { 916 rev1, rev2 = rev2, rev1 917 } 918 require.NoError(t.T(), p.run(ctx2, t.etcdTestCli, rev1, rev2)) 919 }() 920 // PUT i11, will create a lock but not synced. 921 require.Eventually(t.T(), func() bool { 922 return len(p.Locks()) == 1 923 }, 3*time.Second, 100*time.Millisecond) 924 require.Contains(t.T(), p.Locks(), ID1) 925 synced, remain := p.Locks()[ID1].IsSynced() 926 require.False(t.T(), synced) 927 require.Equal(t.T(), 1, remain) 928 929 // PUT i12, the lock will be synced, then an operation PUT for the owner will be triggered. 930 rev1, err := pessimism.PutInfo(t.etcdTestCli, i12) 931 require.NoError(t.T(), err) 932 require.Eventually(t.T(), func() bool { 933 synced, _ = p.Locks()[ID1].IsSynced() 934 return synced 935 }, 3*time.Second, 100*time.Millisecond) 936 937 // wait exec operation for the owner become available. 938 opCh = make(chan pessimism.Operation, 10) 939 errCh = make(chan error, 10) 940 ctx3, cancel3 := context.WithTimeout(ctx, watchTimeout) 941 pessimism.WatchOperationPut(ctx3, t.etcdTestCli, task1, source1, rev1+1, opCh, errCh) 942 cancel3() 943 close(opCh) 944 close(errCh) 945 require.Equal(t.T(), 0, len(errCh)) 946 require.Equal(t.T(), 1, len(opCh)) 947 op11 := <-opCh 948 require.True(t.T(), op11.Exec) 949 require.False(t.T(), op11.Done) 950 951 // mark exec operation for the owner as `done` (and delete the info). 952 op11c := op11 953 op11c.Done = true 954 done, rev2, err := pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op11c, i11) 955 require.NoError(t.T(), err) 956 require.True(t.T(), done) 957 require.Eventually(t.T(), func() bool { 958 return p.Locks()[ID1].IsDone(source1) 959 }, 3*time.Second, 100*time.Millisecond) 960 961 // wait skip operation for the non-owner become available. 962 opCh = make(chan pessimism.Operation, 10) 963 errCh = make(chan error, 10) 964 ctx3, cancel3 = context.WithTimeout(ctx, watchTimeout) 965 pessimism.WatchOperationPut(ctx3, t.etcdTestCli, task1, source2, rev2+1, opCh, errCh) 966 cancel3() 967 close(opCh) 968 close(errCh) 969 require.Equal(t.T(), 0, len(errCh)) 970 require.Equal(t.T(), 1, len(opCh)) 971 op12 := <-opCh 972 require.False(t.T(), op12.Exec) 973 require.False(t.T(), op12.Done) 974 975 // mark skip operation for the non-owner as `done` (and delete the info). 976 // the lock should become resolved and deleted. 977 op12c := op12 978 op12c.Done = true 979 done, _, err = pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op12c, i12) 980 require.NoError(t.T(), err) 981 require.True(t.T(), done) 982 require.Eventually(t.T(), func() bool { 983 _, ok := p.Locks()[ID1] 984 return !ok 985 }, 5*time.Second, 100*time.Millisecond) 986 require.Len(t.T(), p.Locks(), 0) 987 988 cancel2() 989 wg.Wait() 990 } 991 } 992 993 func (t *testPessimistSuite) putDoneForSource( 994 ctx context.Context, task, source string, info pessimism.Info, exec bool, 995 watchRev int64, watchTimeout time.Duration, 996 ) { 997 t.T().Helper() 998 var ( 999 wg sync.WaitGroup 1000 opCh = make(chan pessimism.Operation, 10) 1001 errCh = make(chan error, 10) 1002 ctx2, cancel2 = context.WithTimeout(ctx, watchTimeout) 1003 doneErr error 1004 ) 1005 1006 wg.Add(2) 1007 go func() { 1008 defer wg.Done() 1009 pessimism.WatchOperationPut(ctx2, t.etcdTestCli, task, source, watchRev, opCh, errCh) 1010 close(opCh) 1011 close(errCh) 1012 }() 1013 go func() { 1014 defer func() { 1015 cancel2() 1016 wg.Done() 1017 }() 1018 select { 1019 case <-ctx2.Done(): 1020 doneErr = errors.New("wait for the operation of the source timeout") 1021 case op := <-opCh: 1022 // put `done` after received non-`done`. 1023 require.Equal(t.T(), exec, op.Exec) 1024 require.False(t.T(), op.Done) 1025 op.Done = true 1026 done, _, err := pessimism.PutOperationDeleteExistInfo(t.etcdTestCli, op, info) 1027 require.NoError(t.T(), err) 1028 require.True(t.T(), done) 1029 case err := <-errCh: 1030 doneErr = err 1031 } 1032 }() 1033 wg.Wait() 1034 require.NoError(t.T(), doneErr) 1035 } 1036 1037 func (t *testPessimistSuite) noLockExist(p *Pessimist) { 1038 t.T().Helper() 1039 require.Len(t.T(), p.Locks(), 0) 1040 ifm, _, err := pessimism.GetAllInfo(t.etcdTestCli) 1041 require.NoError(t.T(), err) 1042 require.Len(t.T(), ifm, 0) 1043 opm, _, err := pessimism.GetAllOperations(t.etcdTestCli) 1044 require.NoError(t.T(), err) 1045 require.Len(t.T(), opm, 0) 1046 }