github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/replication/replication_set_test.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package replication 15 16 import ( 17 "container/heap" 18 "encoding/json" 19 "math/rand" 20 "testing" 21 "time" 22 23 "github.com/pingcap/tiflow/cdc/model" 24 "github.com/pingcap/tiflow/cdc/processor/tablepb" 25 "github.com/pingcap/tiflow/cdc/scheduler/schedulepb" 26 "github.com/pingcap/tiflow/pkg/spanz" 27 "github.com/stretchr/testify/require" 28 ) 29 30 // See https://stackoverflow.com/a/30230552/3920448 for details. 31 func nextPerm(p []int) { 32 for i := len(p) - 1; i >= 0; i-- { 33 if i == 0 || p[i] < len(p)-i-1 { 34 p[i]++ 35 return 36 } 37 p[i] = 0 38 } 39 } 40 41 func getPerm(orig, p []int) []int { 42 result := append([]int{}, orig...) 43 for i, v := range p { 44 result[i], result[i+v] = result[i+v], result[i] 45 } 46 return result 47 } 48 49 func iterPermutation(sequence []int, fn func(sequence []int)) { 50 for p := make([]int, len(sequence)); p[0] < len(p); nextPerm(p) { 51 fn(getPerm(sequence, p)) 52 } 53 } 54 55 func TestNewReplicationSet(t *testing.T) { 56 testcases := []struct { 57 set *ReplicationSet 58 checkpoint model.Ts 59 tableStatus map[model.CaptureID]*tablepb.TableStatus 60 }{ 61 { 62 set: &ReplicationSet{ 63 State: ReplicationSetStateAbsent, 64 Captures: map[string]Role{}, 65 Stats: tablepb.Stats{ 66 StageCheckpoints: map[string]tablepb.Checkpoint{ 67 "puller-egress": { 68 CheckpointTs: 0, 69 ResolvedTs: 0, 70 LastSyncedTs: 0, 71 }, 72 "puller-ingress": { 73 CheckpointTs: 0, 74 ResolvedTs: 0, 75 LastSyncedTs: 0, 76 }, 77 "sink": { 78 CheckpointTs: 0, 79 ResolvedTs: 0, 80 LastSyncedTs: 0, 81 }, 82 "sorter-ingress": { 83 CheckpointTs: 0, 84 ResolvedTs: 0, 85 LastSyncedTs: 0, 86 }, 87 "sorter-egress": { 88 CheckpointTs: 0, 89 ResolvedTs: 0, 90 LastSyncedTs: 0, 91 }, 92 }, 93 }, 94 }, 95 tableStatus: map[model.CaptureID]*tablepb.TableStatus{}, 96 }, 97 { 98 set: &ReplicationSet{ 99 Primary: "1", 100 State: ReplicationSetStateReplicating, 101 Captures: map[string]Role{"1": RolePrimary}, 102 Checkpoint: tablepb.Checkpoint{ 103 CheckpointTs: 2, ResolvedTs: 2, 104 }, 105 Stats: tablepb.Stats{ 106 StageCheckpoints: map[string]tablepb.Checkpoint{ 107 "puller-egress": { 108 CheckpointTs: 2, 109 ResolvedTs: 2, 110 LastSyncedTs: 0, 111 }, 112 "puller-ingress": { 113 CheckpointTs: 2, 114 ResolvedTs: 2, 115 LastSyncedTs: 0, 116 }, 117 "sink": { 118 CheckpointTs: 2, 119 ResolvedTs: 2, 120 LastSyncedTs: 0, 121 }, 122 "sorter-ingress": { 123 CheckpointTs: 2, 124 ResolvedTs: 2, 125 LastSyncedTs: 0, 126 }, 127 "sorter-egress": { 128 CheckpointTs: 2, 129 ResolvedTs: 2, 130 LastSyncedTs: 0, 131 }, 132 }, 133 }, 134 }, 135 checkpoint: 2, 136 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 137 "1": { 138 State: tablepb.TableStateReplicating, 139 Checkpoint: tablepb.Checkpoint{ 140 CheckpointTs: 1, ResolvedTs: 1, 141 }, 142 }, 143 }, 144 }, 145 { 146 // Rebuild add table state. 147 set: &ReplicationSet{ 148 State: ReplicationSetStatePrepare, 149 Captures: map[string]Role{"1": RoleSecondary}, 150 Stats: tablepb.Stats{ 151 StageCheckpoints: map[string]tablepb.Checkpoint{ 152 "puller-egress": { 153 CheckpointTs: 0, 154 ResolvedTs: 0, 155 LastSyncedTs: 0, 156 }, 157 "puller-ingress": { 158 CheckpointTs: 0, 159 ResolvedTs: 0, 160 LastSyncedTs: 0, 161 }, 162 "sink": { 163 CheckpointTs: 0, 164 ResolvedTs: 0, 165 LastSyncedTs: 0, 166 }, 167 "sorter-ingress": { 168 CheckpointTs: 0, 169 ResolvedTs: 0, 170 LastSyncedTs: 0, 171 }, 172 "sorter-egress": { 173 CheckpointTs: 0, 174 ResolvedTs: 0, 175 LastSyncedTs: 0, 176 }, 177 }, 178 }, 179 }, 180 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 181 "1": { 182 State: tablepb.TableStatePreparing, 183 Checkpoint: tablepb.Checkpoint{}, 184 }, 185 }, 186 }, 187 { 188 // Rebuild move table state, Prepare. 189 set: &ReplicationSet{ 190 State: ReplicationSetStatePrepare, 191 Primary: "2", 192 Captures: map[string]Role{ 193 "1": RoleSecondary, "2": RolePrimary, 194 }, 195 Checkpoint: tablepb.Checkpoint{ 196 CheckpointTs: 2, 197 ResolvedTs: 2, 198 }, 199 Stats: tablepb.Stats{ 200 StageCheckpoints: map[string]tablepb.Checkpoint{ 201 "puller-egress": { 202 CheckpointTs: 0, 203 ResolvedTs: 0, 204 LastSyncedTs: 0, 205 }, 206 "puller-ingress": { 207 CheckpointTs: 0, 208 ResolvedTs: 0, 209 LastSyncedTs: 0, 210 }, 211 "sink": { 212 CheckpointTs: 0, 213 ResolvedTs: 0, 214 LastSyncedTs: 0, 215 }, 216 "sorter-ingress": { 217 CheckpointTs: 0, 218 ResolvedTs: 0, 219 LastSyncedTs: 0, 220 }, 221 "sorter-egress": { 222 CheckpointTs: 0, 223 ResolvedTs: 0, 224 LastSyncedTs: 0, 225 }, 226 }, 227 }, 228 }, 229 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 230 "1": { 231 State: tablepb.TableStatePreparing, 232 Checkpoint: tablepb.Checkpoint{ 233 CheckpointTs: 1, 234 ResolvedTs: 1, 235 }, 236 }, 237 "2": { 238 State: tablepb.TableStateReplicating, 239 Checkpoint: tablepb.Checkpoint{ 240 CheckpointTs: 2, 241 ResolvedTs: 2, 242 }, 243 }, 244 }, 245 }, 246 { 247 // Rebuild move table state, Commit. 248 set: &ReplicationSet{ 249 State: ReplicationSetStateCommit, 250 Primary: "2", 251 Captures: map[string]Role{ 252 "1": RoleSecondary, "2": RolePrimary, 253 }, 254 Stats: tablepb.Stats{ 255 StageCheckpoints: map[string]tablepb.Checkpoint{ 256 "puller-egress": { 257 CheckpointTs: 0, 258 ResolvedTs: 0, 259 LastSyncedTs: 0, 260 }, 261 "puller-ingress": { 262 CheckpointTs: 0, 263 ResolvedTs: 0, 264 LastSyncedTs: 0, 265 }, 266 "sink": { 267 CheckpointTs: 0, 268 ResolvedTs: 0, 269 LastSyncedTs: 0, 270 }, 271 "sorter-ingress": { 272 CheckpointTs: 0, 273 ResolvedTs: 0, 274 LastSyncedTs: 0, 275 }, 276 "sorter-egress": { 277 CheckpointTs: 0, 278 ResolvedTs: 0, 279 LastSyncedTs: 0, 280 }, 281 }, 282 }, 283 }, 284 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 285 "1": { 286 State: tablepb.TableStatePrepared, 287 Checkpoint: tablepb.Checkpoint{}, 288 }, 289 "2": { 290 State: tablepb.TableStateReplicating, 291 Checkpoint: tablepb.Checkpoint{}, 292 }, 293 }, 294 }, 295 { 296 // Rebuild move table state, Commit, original primary stopping. 297 set: &ReplicationSet{ 298 State: ReplicationSetStateCommit, 299 Captures: map[string]Role{ 300 "1": RoleSecondary, "2": RoleUndetermined, 301 }, 302 Stats: tablepb.Stats{ 303 StageCheckpoints: map[string]tablepb.Checkpoint{ 304 "puller-egress": { 305 CheckpointTs: 0, 306 ResolvedTs: 0, 307 LastSyncedTs: 0, 308 }, 309 "puller-ingress": { 310 CheckpointTs: 0, 311 ResolvedTs: 0, 312 LastSyncedTs: 0, 313 }, 314 "sink": { 315 CheckpointTs: 0, 316 ResolvedTs: 0, 317 LastSyncedTs: 0, 318 }, 319 "sorter-ingress": { 320 CheckpointTs: 0, 321 ResolvedTs: 0, 322 LastSyncedTs: 0, 323 }, 324 "sorter-egress": { 325 CheckpointTs: 0, 326 ResolvedTs: 0, 327 LastSyncedTs: 0, 328 }, 329 }, 330 }, 331 }, 332 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 333 "1": { 334 State: tablepb.TableStatePrepared, 335 Checkpoint: tablepb.Checkpoint{}, 336 }, 337 "2": { 338 State: tablepb.TableStateStopping, 339 Checkpoint: tablepb.Checkpoint{}, 340 }, 341 }, 342 }, 343 { 344 // Rebuild move table state, Commit, original primary stopped. 345 set: &ReplicationSet{ 346 State: ReplicationSetStateCommit, 347 Captures: map[string]Role{"1": RoleSecondary}, 348 Stats: tablepb.Stats{ 349 StageCheckpoints: map[string]tablepb.Checkpoint{ 350 "puller-egress": { 351 CheckpointTs: 0, 352 ResolvedTs: 0, 353 LastSyncedTs: 0, 354 }, 355 "puller-ingress": { 356 CheckpointTs: 0, 357 ResolvedTs: 0, 358 LastSyncedTs: 0, 359 }, 360 "sink": { 361 CheckpointTs: 0, 362 ResolvedTs: 0, 363 LastSyncedTs: 0, 364 }, 365 "sorter-ingress": { 366 CheckpointTs: 0, 367 ResolvedTs: 0, 368 LastSyncedTs: 0, 369 }, 370 "sorter-egress": { 371 CheckpointTs: 0, 372 ResolvedTs: 0, 373 LastSyncedTs: 0, 374 }, 375 }, 376 }, 377 }, 378 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 379 "1": { 380 State: tablepb.TableStatePrepared, 381 Checkpoint: tablepb.Checkpoint{}, 382 }, 383 "2": { 384 State: tablepb.TableStateStopped, 385 Checkpoint: tablepb.Checkpoint{}, 386 }, 387 }, 388 }, 389 { 390 // Rebuild remove table state, Removing. 391 set: &ReplicationSet{ 392 State: ReplicationSetStateRemoving, 393 Captures: map[string]Role{ 394 "1": RoleUndetermined, "2": RoleUndetermined, 395 }, 396 Stats: tablepb.Stats{ 397 StageCheckpoints: map[string]tablepb.Checkpoint{ 398 "puller-egress": { 399 CheckpointTs: 0, 400 ResolvedTs: 0, 401 LastSyncedTs: 0, 402 }, 403 "puller-ingress": { 404 CheckpointTs: 0, 405 ResolvedTs: 0, 406 LastSyncedTs: 0, 407 }, 408 "sink": { 409 CheckpointTs: 0, 410 ResolvedTs: 0, 411 LastSyncedTs: 0, 412 }, 413 "sorter-ingress": { 414 CheckpointTs: 0, 415 ResolvedTs: 0, 416 LastSyncedTs: 0, 417 }, 418 "sorter-egress": { 419 CheckpointTs: 0, 420 ResolvedTs: 0, 421 LastSyncedTs: 0, 422 }, 423 }, 424 }, 425 }, 426 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 427 "1": { 428 State: tablepb.TableStateStopping, 429 Checkpoint: tablepb.Checkpoint{}, 430 }, 431 "2": { 432 State: tablepb.TableStateStopping, 433 Checkpoint: tablepb.Checkpoint{}, 434 }, 435 }, 436 }, 437 { 438 // Multiple primary error. 439 set: nil, 440 tableStatus: map[model.CaptureID]*tablepb.TableStatus{ 441 "1": { 442 State: tablepb.TableStateReplicating, 443 Checkpoint: tablepb.Checkpoint{}, 444 }, 445 "2": { 446 State: tablepb.TableStateReplicating, 447 Checkpoint: tablepb.Checkpoint{}, 448 }, 449 }, 450 }, 451 } 452 for id, tc := range testcases { 453 set := tc.set 454 status := tc.tableStatus 455 checkpoint := tc.checkpoint 456 457 span := tablepb.Span{TableID: 0} 458 output, err := NewReplicationSet(span, checkpoint, status, model.ChangeFeedID{}) 459 if set == nil { 460 require.Errorf(t, err, "%d", id) 461 } else { 462 require.Nilf(t, err, "%+v, %d", err, id) 463 require.EqualValuesf(t, set, output, "%d", id) 464 } 465 } 466 } 467 468 // Test all table states and replication states. 469 func TestReplicationSetPoll(t *testing.T) { 470 var testcases []map[string]tablepb.TableState 471 for state1 := range tablepb.TableState_name { 472 for state2 := range tablepb.TableState_name { 473 if state1 == state2 && state1 == int32(tablepb.TableStateReplicating) { 474 continue 475 } 476 tc := map[string]tablepb.TableState{ 477 "1": tablepb.TableState(state1), 478 "2": tablepb.TableState(state2), 479 } 480 testcases = append(testcases, tc) 481 } 482 } 483 seed := time.Now().Unix() 484 rnd := rand.New(rand.NewSource(seed)) 485 rnd.Shuffle(len(testcases), func(i, j int) { 486 testcases[i], testcases[j] = testcases[j], testcases[i] 487 }) 488 // It takes minutes to complete all test cases. 489 // To speed up, we only test the first 2 cases. 490 testcases = testcases[:2] 491 492 from := "1" 493 for _, states := range testcases { 494 status := make(map[string]*tablepb.TableStatus) 495 for id, state := range states { 496 status[id] = &tablepb.TableStatus{ 497 Span: tablepb.Span{TableID: 1}, 498 State: state, 499 Checkpoint: tablepb.Checkpoint{}, 500 } 501 } 502 span := tablepb.Span{TableID: 1} 503 r, _ := NewReplicationSet(span, 0, status, model.ChangeFeedID{}) 504 var tableStates []int 505 for state := range tablepb.TableState_name { 506 tableStates = append(tableStates, int(state)) 507 } 508 input := &tablepb.TableStatus{Span: tablepb.Span{TableID: model.TableID(1)}} 509 iterPermutation(tableStates, func(tableStateSequence []int) { 510 t.Logf("test %d, %v, %v", seed, status, tableStateSequence) 511 for _, state := range tableStateSequence { 512 input.State = tablepb.TableState(state) 513 msgs, _ := r.poll(input, from) 514 for i := range msgs { 515 if msgs[i] == nil { 516 t.Errorf("nil messages: %v, input: %v, from: %s, r: %v", 517 msgs, *input, from, *r) 518 } 519 } 520 // For now, poll() is expected to output at most one message. 521 if len(msgs) > 1 { 522 t.Errorf("too many messages: %v, input: %v, from: %s, r: %v", 523 msgs, *input, from, *r) 524 } 525 } 526 }) 527 } 528 } 529 530 func TestReplicationSetPollUnknownCapture(t *testing.T) { 531 t.Parallel() 532 533 tableID := model.TableID(1) 534 span := tablepb.Span{TableID: tableID} 535 r, err := NewReplicationSet(span, 0, map[model.CaptureID]*tablepb.TableStatus{ 536 "1": { 537 Span: tablepb.Span{TableID: tableID}, 538 State: tablepb.TableStateReplicating, 539 Checkpoint: tablepb.Checkpoint{}, 540 }, 541 }, model.ChangeFeedID{}) 542 require.Nil(t, err) 543 544 msgs, err := r.poll(&tablepb.TableStatus{ 545 Span: tablepb.Span{TableID: tableID}, 546 State: tablepb.TableStateReplicating, 547 }, "unknown") 548 require.Nil(t, msgs) 549 require.Nil(t, err) 550 551 msgs, err = r.poll(&tablepb.TableStatus{ 552 Span: tablepb.Span{TableID: tableID}, 553 State: tablepb.TableStateAbsent, 554 }, "unknown") 555 require.Len(t, msgs, 0) 556 require.Nil(t, err) 557 558 msgs, err = r.poll(&tablepb.TableStatus{ 559 Span: tablepb.Span{TableID: tableID}, 560 State: tablepb.TableStateReplicating, 561 }, "unknown") 562 require.Len(t, msgs, 0) 563 require.Nil(t, err) 564 } 565 566 func TestReplicationSetAddTable(t *testing.T) { 567 t.Parallel() 568 569 from := "1" 570 tableID := model.TableID(1) 571 span := tablepb.Span{TableID: tableID} 572 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 573 require.Nil(t, err) 574 575 // Absent -> Prepare 576 msgs, err := r.handleAddTable(from) 577 require.Nil(t, err) 578 require.Len(t, msgs, 1) 579 require.EqualValues(t, &schedulepb.Message{ 580 To: from, 581 MsgType: schedulepb.MsgDispatchTableRequest, 582 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 583 Request: &schedulepb.DispatchTableRequest_AddTable{ 584 AddTable: &schedulepb.AddTableRequest{ 585 Span: tablepb.Span{TableID: r.Span.TableID}, 586 IsSecondary: true, 587 Checkpoint: r.Checkpoint, 588 }, 589 }, 590 }, 591 }, msgs[0]) 592 require.Equal(t, ReplicationSetStatePrepare, r.State) 593 require.True(t, r.isInRole(from, RoleSecondary)) 594 595 // No-op if add table again. 596 msgs, err = r.handleAddTable(from) 597 require.Nil(t, err) 598 require.Len(t, msgs, 0) 599 600 // AddTableRequest is lost somehow, send AddTableRequest again. 601 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 602 Span: tablepb.Span{TableID: tableID}, 603 State: tablepb.TableStateAbsent, 604 }) 605 require.Nil(t, err) 606 require.Len(t, msgs, 1) 607 require.EqualValues(t, &schedulepb.Message{ 608 To: from, 609 MsgType: schedulepb.MsgDispatchTableRequest, 610 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 611 Request: &schedulepb.DispatchTableRequest_AddTable{ 612 AddTable: &schedulepb.AddTableRequest{ 613 Span: tablepb.Span{TableID: r.Span.TableID}, 614 IsSecondary: true, 615 Checkpoint: r.Checkpoint, 616 }, 617 }, 618 }, 619 }, msgs[0]) 620 require.Equal(t, ReplicationSetStatePrepare, r.State) 621 require.True(t, r.isInRole(from, RoleSecondary)) 622 623 // Prepare is in-progress. 624 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 625 Span: tablepb.Span{TableID: tableID}, 626 State: tablepb.TableStatePreparing, 627 }) 628 require.Nil(t, err) 629 require.Len(t, msgs, 0) 630 require.Equal(t, ReplicationSetStatePrepare, r.State) 631 require.True(t, r.isInRole(from, RoleSecondary)) 632 633 // Prepare -> Commit. 634 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 635 Span: tablepb.Span{TableID: tableID}, 636 State: tablepb.TableStatePrepared, 637 }) 638 require.Nil(t, err) 639 require.Len(t, msgs, 1) 640 require.EqualValues(t, &schedulepb.Message{ 641 To: from, 642 MsgType: schedulepb.MsgDispatchTableRequest, 643 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 644 Request: &schedulepb.DispatchTableRequest_AddTable{ 645 AddTable: &schedulepb.AddTableRequest{ 646 Span: tablepb.Span{TableID: r.Span.TableID}, 647 IsSecondary: false, 648 Checkpoint: r.Checkpoint, 649 }, 650 }, 651 }, 652 }, msgs[0]) 653 require.Equal(t, ReplicationSetStateCommit, r.State) 654 require.Equal(t, from, r.Primary) 655 require.False(t, r.hasRole(RoleSecondary)) 656 // The secondary AddTable request may be lost. 657 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 658 Span: tablepb.Span{TableID: tableID}, 659 State: tablepb.TableStatePrepared, 660 }) 661 require.Nil(t, err) 662 require.Len(t, msgs, 1) 663 require.EqualValues(t, &schedulepb.Message{ 664 To: from, 665 MsgType: schedulepb.MsgDispatchTableRequest, 666 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 667 Request: &schedulepb.DispatchTableRequest_AddTable{ 668 AddTable: &schedulepb.AddTableRequest{ 669 Span: tablepb.Span{TableID: r.Span.TableID}, 670 IsSecondary: false, 671 Checkpoint: r.Checkpoint, 672 }, 673 }, 674 }, 675 }, msgs[0]) 676 require.Equal(t, ReplicationSetStateCommit, r.State) 677 require.Equal(t, from, r.Primary) 678 require.False(t, r.hasRole(RoleSecondary)) 679 680 // Commit -> Replicating 681 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 682 Span: tablepb.Span{TableID: tableID}, 683 State: tablepb.TableStateReplicating, 684 }) 685 require.Nil(t, err) 686 require.Len(t, msgs, 0) 687 require.Equal(t, ReplicationSetStateReplicating, r.State) 688 require.Equal(t, from, r.Primary) 689 require.False(t, r.hasRole(RoleSecondary)) 690 691 // Replicating -> Replicating 692 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 693 Span: tablepb.Span{TableID: tableID}, 694 State: tablepb.TableStateReplicating, 695 Checkpoint: tablepb.Checkpoint{ 696 CheckpointTs: 3, 697 ResolvedTs: 4, 698 }, 699 }) 700 require.Nil(t, err) 701 require.Len(t, msgs, 0) 702 require.Equal(t, ReplicationSetStateReplicating, r.State) 703 require.Equal(t, from, r.Primary) 704 require.False(t, r.hasRole(RoleSecondary)) 705 require.Equal(t, tablepb.Checkpoint{ 706 CheckpointTs: 3, 707 ResolvedTs: 4, 708 }, r.Checkpoint) 709 } 710 711 func TestReplicationSetRemoveTable(t *testing.T) { 712 t.Parallel() 713 714 from := "1" 715 tableID := model.TableID(1) 716 span := tablepb.Span{TableID: tableID} 717 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 718 require.Nil(t, err) 719 720 // Ignore removing table if it's not in replicating. 721 msgs, err := r.handleRemoveTable() 722 require.Nil(t, err) 723 require.Len(t, msgs, 0) 724 require.False(t, r.hasRemoved()) 725 726 // Replicating -> Removing 727 r.State = ReplicationSetStateReplicating 728 require.Nil(t, r.setCapture(from, RoleSecondary)) 729 require.Nil(t, r.promoteSecondary(from)) 730 msgs, err = r.handleRemoveTable() 731 require.Nil(t, err) 732 require.Len(t, msgs, 1) 733 require.EqualValues(t, &schedulepb.Message{ 734 To: from, 735 MsgType: schedulepb.MsgDispatchTableRequest, 736 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 737 Request: &schedulepb.DispatchTableRequest_RemoveTable{ 738 RemoveTable: &schedulepb.RemoveTableRequest{ 739 Span: tablepb.Span{TableID: r.Span.TableID}, 740 }, 741 }, 742 }, 743 }, msgs[0]) 744 require.Equal(t, ReplicationSetStateRemoving, r.State) 745 require.False(t, r.hasRemoved()) 746 747 // Ignore remove table if it's in-progress. 748 msgs, err = r.handleRemoveTable() 749 require.Nil(t, err) 750 require.Len(t, msgs, 0) 751 752 // Removing is in-progress. 753 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 754 Span: tablepb.Span{TableID: tableID}, 755 State: tablepb.TableStateStopping, 756 }) 757 require.Nil(t, err) 758 require.Len(t, msgs, 0) 759 require.Equal(t, ReplicationSetStateRemoving, r.State) 760 require.False(t, r.hasRemoved()) 761 762 // Removed if the table is absent. 763 rClone := clone(r) 764 msgs, err = rClone.handleTableStatus(from, &tablepb.TableStatus{ 765 Span: tablepb.Span{TableID: tableID}, 766 State: tablepb.TableStateStopped, 767 }) 768 require.Nil(t, err) 769 require.Len(t, msgs, 0) 770 require.Equal(t, ReplicationSetStateRemoving, rClone.State) 771 require.True(t, rClone.hasRemoved()) 772 773 // Removed if the table is stopped. 774 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 775 Span: tablepb.Span{TableID: tableID}, 776 State: tablepb.TableStateStopped, 777 }) 778 require.Nil(t, err) 779 require.Len(t, msgs, 0) 780 require.Equal(t, ReplicationSetStateRemoving, r.State) 781 require.True(t, r.hasRemoved()) 782 } 783 784 func clone(r *ReplicationSet) *ReplicationSet { 785 rClone := *r 786 rClone.Captures = make(map[string]Role) 787 for captureID, role := range r.Captures { 788 rClone.Captures[captureID] = role 789 } 790 return &rClone 791 } 792 793 func TestReplicationSetMoveTable(t *testing.T) { 794 t.Parallel() 795 796 tableID := model.TableID(1) 797 span := tablepb.Span{TableID: tableID} 798 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 799 require.Nil(t, err) 800 801 source := "1" 802 dest := "2" 803 // Ignore removing table if it's not in replicating. 804 r.State = ReplicationSetStatePrepare 805 require.Nil(t, r.setCapture(source, RoleSecondary)) 806 msgs, err := r.handleMoveTable(dest) 807 require.Nil(t, err) 808 require.Len(t, msgs, 0) 809 require.NotContains(t, r.Captures, dest) 810 811 r.State = ReplicationSetStateReplicating 812 require.Nil(t, r.promoteSecondary(source)) 813 814 // Replicating -> Prepare 815 msgs, err = r.handleMoveTable(dest) 816 require.Nil(t, err) 817 require.Len(t, msgs, 1) 818 require.EqualValues(t, &schedulepb.Message{ 819 To: dest, 820 MsgType: schedulepb.MsgDispatchTableRequest, 821 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 822 Request: &schedulepb.DispatchTableRequest_AddTable{ 823 AddTable: &schedulepb.AddTableRequest{ 824 Span: tablepb.Span{TableID: r.Span.TableID}, 825 IsSecondary: true, 826 Checkpoint: r.Checkpoint, 827 }, 828 }, 829 }, 830 }, msgs[0]) 831 require.Equal(t, ReplicationSetStatePrepare, r.State) 832 require.True(t, r.isInRole(dest, RoleSecondary)) 833 require.Equal(t, source, r.Primary) 834 835 // No-op if add table again. 836 msgs, err = r.handleAddTable(dest) 837 require.Nil(t, err) 838 require.Len(t, msgs, 0) 839 840 // Source primary sends heartbeat response 841 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 842 Span: tablepb.Span{TableID: tableID}, 843 State: tablepb.TableStateReplicating, 844 Checkpoint: tablepb.Checkpoint{ 845 CheckpointTs: 1, 846 ResolvedTs: 1, 847 }, 848 }) 849 require.Nil(t, err) 850 require.Len(t, msgs, 0) 851 require.Equal(t, tablepb.Checkpoint{ 852 CheckpointTs: 1, 853 ResolvedTs: 1, 854 }, r.Checkpoint) 855 856 // AddTableRequest is lost somehow, send AddTableRequest again. 857 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 858 Span: tablepb.Span{TableID: tableID}, 859 State: tablepb.TableStateAbsent, 860 }) 861 require.Nil(t, err) 862 require.Len(t, msgs, 1) 863 require.EqualValues(t, &schedulepb.Message{ 864 To: dest, 865 MsgType: schedulepb.MsgDispatchTableRequest, 866 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 867 Request: &schedulepb.DispatchTableRequest_AddTable{ 868 AddTable: &schedulepb.AddTableRequest{ 869 Span: tablepb.Span{TableID: r.Span.TableID}, 870 IsSecondary: true, 871 Checkpoint: r.Checkpoint, 872 }, 873 }, 874 }, 875 }, msgs[0]) 876 require.Equal(t, ReplicationSetStatePrepare, r.State) 877 require.True(t, r.isInRole(dest, RoleSecondary)) 878 879 // Prepare -> Commit. 880 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 881 Span: tablepb.Span{TableID: tableID}, 882 State: tablepb.TableStatePrepared, 883 }) 884 require.Nil(t, err) 885 require.Len(t, msgs, 1) 886 require.EqualValues(t, &schedulepb.Message{ 887 To: source, 888 MsgType: schedulepb.MsgDispatchTableRequest, 889 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 890 Request: &schedulepb.DispatchTableRequest_RemoveTable{ 891 RemoveTable: &schedulepb.RemoveTableRequest{Span: r.Span}, 892 }, 893 }, 894 }, msgs[0]) 895 require.Equal(t, ReplicationSetStateCommit, r.State) 896 require.Equal(t, source, r.Primary) 897 require.True(t, r.isInRole(dest, RoleSecondary)) 898 899 // Source updates it's table status 900 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 901 Span: tablepb.Span{TableID: tableID}, 902 State: tablepb.TableStateReplicating, 903 Checkpoint: tablepb.Checkpoint{ 904 CheckpointTs: 2, 905 ResolvedTs: 3, 906 }, 907 }) 908 require.Nil(t, err) 909 require.Len(t, msgs, 1, "%v", r) 910 require.EqualValues(t, &schedulepb.Message{ 911 To: source, 912 MsgType: schedulepb.MsgDispatchTableRequest, 913 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 914 Request: &schedulepb.DispatchTableRequest_RemoveTable{ 915 RemoveTable: &schedulepb.RemoveTableRequest{Span: r.Span}, 916 }, 917 }, 918 }, msgs[0]) 919 require.Equal(t, ReplicationSetStateCommit, r.State) 920 require.Equal(t, source, r.Primary) 921 require.True(t, r.isInRole(dest, RoleSecondary)) 922 require.Equal(t, tablepb.Checkpoint{ 923 CheckpointTs: 2, 924 ResolvedTs: 3, 925 }, r.Checkpoint) 926 927 // Removing source is in-progress. 928 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 929 Span: tablepb.Span{TableID: tableID}, 930 State: tablepb.TableStateStopping, 931 Checkpoint: tablepb.Checkpoint{ 932 CheckpointTs: 3, 933 ResolvedTs: 3, 934 }, 935 }) 936 require.Nil(t, err) 937 require.Len(t, msgs, 0) 938 require.Equal(t, ReplicationSetStateCommit, r.State) 939 require.Equal(t, source, r.Primary) 940 require.True(t, r.isInRole(dest, RoleSecondary)) 941 require.Equal(t, tablepb.Checkpoint{ 942 CheckpointTs: 3, 943 ResolvedTs: 3, 944 }, r.Checkpoint) 945 946 // Source is removed. 947 rClone := clone(r) 948 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 949 Span: tablepb.Span{TableID: tableID}, 950 State: tablepb.TableStateStopped, 951 Checkpoint: tablepb.Checkpoint{ 952 CheckpointTs: 3, 953 ResolvedTs: 4, 954 }, 955 }) 956 require.Nil(t, err) 957 require.Len(t, msgs, 1) 958 require.EqualValues(t, &schedulepb.Message{ 959 To: dest, 960 MsgType: schedulepb.MsgDispatchTableRequest, 961 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 962 Request: &schedulepb.DispatchTableRequest_AddTable{ 963 AddTable: &schedulepb.AddTableRequest{ 964 Span: tablepb.Span{TableID: r.Span.TableID}, 965 IsSecondary: false, 966 Checkpoint: r.Checkpoint, 967 }, 968 }, 969 }, 970 }, msgs[0]) 971 require.Equal(t, ReplicationSetStateCommit, r.State) 972 require.Equal(t, dest, r.Primary) 973 require.False(t, r.hasRole(RoleSecondary)) 974 require.Equal(t, tablepb.Checkpoint{ 975 CheckpointTs: 3, 976 ResolvedTs: 4, 977 }, r.Checkpoint) 978 979 // Source stopped message is lost somehow. 980 // rClone has checkpoint ts 3, resolved ts 3 981 msgs, err = rClone.handleTableStatus(source, &tablepb.TableStatus{ 982 Span: tablepb.Span{TableID: tableID}, 983 State: tablepb.TableStateAbsent, 984 }) 985 require.Nil(t, err) 986 require.Len(t, msgs, 1) 987 require.EqualValues(t, &schedulepb.Message{ 988 To: dest, 989 MsgType: schedulepb.MsgDispatchTableRequest, 990 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 991 Request: &schedulepb.DispatchTableRequest_AddTable{ 992 AddTable: &schedulepb.AddTableRequest{ 993 Span: tablepb.Span{TableID: r.Span.TableID}, 994 IsSecondary: false, 995 Checkpoint: tablepb.Checkpoint{ 996 CheckpointTs: 3, 997 ResolvedTs: 3, 998 }, 999 }, 1000 }, 1001 }, 1002 }, msgs[0]) 1003 require.Equal(t, ReplicationSetStateCommit, rClone.State) 1004 require.Equal(t, dest, rClone.Primary) 1005 require.False(t, rClone.hasRole(RoleSecondary)) 1006 require.Equal(t, tablepb.Checkpoint{ 1007 CheckpointTs: 3, 1008 ResolvedTs: 3, 1009 }, rClone.Checkpoint) 1010 1011 // Commit -> Replicating 1012 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 1013 Span: tablepb.Span{TableID: tableID}, 1014 State: tablepb.TableStateReplicating, 1015 }) 1016 require.Nil(t, err) 1017 require.Len(t, msgs, 0) 1018 require.Equal(t, ReplicationSetStateReplicating, r.State) 1019 require.Equal(t, dest, r.Primary) 1020 require.False(t, r.hasRole(RoleSecondary)) 1021 } 1022 1023 //nolint:tparallel 1024 func TestReplicationSetCaptureShutdown(t *testing.T) { 1025 t.Parallel() 1026 1027 from := "1" 1028 tableID := model.TableID(1) 1029 span := tablepb.Span{TableID: tableID} 1030 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 1031 require.Nil(t, err) 1032 1033 // Add table, Absent -> Prepare 1034 msgs, err := r.handleAddTable(from) 1035 require.Nil(t, err) 1036 require.Len(t, msgs, 1) 1037 require.EqualValues(t, &schedulepb.Message{ 1038 To: from, 1039 MsgType: schedulepb.MsgDispatchTableRequest, 1040 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 1041 Request: &schedulepb.DispatchTableRequest_AddTable{ 1042 AddTable: &schedulepb.AddTableRequest{ 1043 Span: tablepb.Span{TableID: r.Span.TableID}, 1044 IsSecondary: true, 1045 Checkpoint: r.Checkpoint, 1046 }, 1047 }, 1048 }, 1049 }, msgs[0]) 1050 require.Equal(t, ReplicationSetStatePrepare, r.State) 1051 require.True(t, r.isInRole(from, RoleSecondary)) 1052 1053 affected := false 1054 // Secondary shutdown during Prepare, Prepare -> Absent 1055 t.Run("AddTableSecondaryShutdownDuringPrepare", func(t *testing.T) { 1056 rClone := clone(r) 1057 msgs, affected, err = rClone.handleCaptureShutdown(from) 1058 require.Nil(t, err) 1059 require.True(t, affected) 1060 require.Len(t, msgs, 0) 1061 require.Empty(t, rClone.Captures) 1062 require.Equal(t, "", rClone.Primary) 1063 require.False(t, rClone.hasRole(RoleSecondary)) 1064 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1065 }) 1066 1067 // Add table, Prepare -> Commit 1068 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 1069 Span: tablepb.Span{TableID: tableID}, 1070 State: tablepb.TableStatePrepared, 1071 }) 1072 require.Nil(t, err) 1073 require.Len(t, msgs, 1) 1074 require.Equal(t, ReplicationSetStateCommit, r.State) 1075 require.Equal(t, from, r.Primary) 1076 require.False(t, r.hasRole(RoleSecondary)) 1077 1078 // Secondary shutdown during Commit, Commit -> Absent 1079 t.Run("AddTableSecondaryShutdownDuringCommit", func(t *testing.T) { 1080 rClone := clone(r) 1081 msgs, affected, err = rClone.handleCaptureShutdown(from) 1082 require.Nil(t, err) 1083 require.True(t, affected) 1084 require.Len(t, msgs, 0) 1085 require.Empty(t, rClone.Captures) 1086 require.Equal(t, "", rClone.Primary) 1087 require.False(t, rClone.hasRole(RoleSecondary)) 1088 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1089 }) 1090 1091 // Add table, Commit -> Replicating 1092 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 1093 Span: tablepb.Span{TableID: tableID}, 1094 State: tablepb.TableStateReplicating, 1095 }) 1096 require.Nil(t, err) 1097 require.Len(t, msgs, 0) 1098 require.Equal(t, ReplicationSetStateReplicating, r.State) 1099 require.Equal(t, from, r.Primary) 1100 require.False(t, r.hasRole(RoleSecondary)) 1101 1102 // Primary shutdown during Replicating, Replicating -> Absent 1103 t.Run("AddTablePrimaryShutdownDuringReplicating", func(t *testing.T) { 1104 rClone := clone(r) 1105 msgs, affected, err = rClone.handleCaptureShutdown(from) 1106 require.Nil(t, err) 1107 require.True(t, affected) 1108 require.Len(t, msgs, 0) 1109 require.Empty(t, rClone.Captures) 1110 require.Equal(t, "", rClone.Primary) 1111 require.False(t, rClone.hasRole(RoleSecondary)) 1112 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1113 }) 1114 1115 // Move table, Replicating -> Prepare 1116 dest := "2" 1117 msgs, err = r.handleMoveTable(dest) 1118 require.Nil(t, err) 1119 require.Len(t, msgs, 1) 1120 require.Equal(t, ReplicationSetStatePrepare, r.State) 1121 require.True(t, r.isInRole(dest, RoleSecondary)) 1122 1123 // Primary shutdown during Prepare, Prepare -> Prepare 1124 t.Run("MoveTablePrimaryShutdownDuringPrepare", func(t *testing.T) { 1125 rClone := clone(r) 1126 msgs, affected, err = rClone.handleCaptureShutdown(rClone.Primary) 1127 require.Nil(t, err) 1128 require.True(t, affected) 1129 require.Len(t, msgs, 0) 1130 require.EqualValues(t, map[string]Role{dest: RoleSecondary}, rClone.Captures) 1131 require.Equal(t, "", rClone.Primary) 1132 require.True(t, rClone.isInRole(dest, RoleSecondary)) 1133 require.Equal(t, ReplicationSetStatePrepare, rClone.State) 1134 // Secondary shutdown after primary shutdown, Prepare -> Absent 1135 secondary, ok := rClone.getRole(RoleSecondary) 1136 require.True(t, ok) 1137 msgs, affected, err = rClone.handleCaptureShutdown(secondary) 1138 require.Nil(t, err) 1139 require.True(t, affected) 1140 require.Len(t, msgs, 0) 1141 require.Empty(t, rClone.Captures) 1142 require.Equal(t, "", rClone.Primary) 1143 require.False(t, rClone.hasRole(RoleSecondary)) 1144 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1145 }) 1146 // Primary shutdown during Prepare, Prepare -> Prepare 1147 t.Run("MoveTableSecondaryShutdownDuringPrepare", func(t *testing.T) { 1148 rClone := clone(r) 1149 secondary, ok := rClone.getRole(RoleSecondary) 1150 require.True(t, ok) 1151 msgs, affected, err = rClone.handleCaptureShutdown(secondary) 1152 require.Nil(t, err) 1153 require.True(t, affected) 1154 require.Len(t, msgs, 0) 1155 require.EqualValues(t, map[string]Role{from: RolePrimary}, rClone.Captures) 1156 require.Equal(t, from, rClone.Primary) 1157 require.False(t, rClone.hasRole(RoleSecondary)) 1158 require.Equal(t, ReplicationSetStateReplicating, rClone.State) 1159 }) 1160 1161 // Move table, Prepare -> Commit 1162 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 1163 Span: tablepb.Span{TableID: tableID}, 1164 State: tablepb.TableStatePrepared, 1165 }) 1166 require.Nil(t, err) 1167 require.Len(t, msgs, 1) 1168 require.Equal(t, ReplicationSetStateCommit, r.State) 1169 require.Equal(t, from, r.Primary) 1170 require.True(t, r.isInRole(dest, RoleSecondary)) 1171 1172 // Original primary shutdown during Commit, Commit -> Commit 1173 t.Run("MoveTableOriginalPrimaryShutdownDuringCommit", func(t *testing.T) { 1174 rClone := clone(r) 1175 msgs, affected, err = rClone.handleCaptureShutdown(rClone.Primary) 1176 require.Nil(t, err) 1177 require.True(t, affected) 1178 require.Len(t, msgs, 1) 1179 require.EqualValues(t, &schedulepb.Message{ 1180 To: dest, 1181 MsgType: schedulepb.MsgDispatchTableRequest, 1182 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 1183 Request: &schedulepb.DispatchTableRequest_AddTable{ 1184 AddTable: &schedulepb.AddTableRequest{ 1185 Span: tablepb.Span{TableID: r.Span.TableID}, 1186 IsSecondary: false, 1187 Checkpoint: r.Checkpoint, 1188 }, 1189 }, 1190 }, 1191 }, msgs[0]) 1192 require.EqualValues(t, map[string]Role{dest: RolePrimary}, rClone.Captures) 1193 require.Equal(t, dest, rClone.Primary) 1194 require.False(t, rClone.hasRole(RoleSecondary)) 1195 require.Equal(t, ReplicationSetStateCommit, rClone.State) 1196 // New primary shutdown after original primary shutdown, Commit -> Absent 1197 msgs, affected, err = rClone.handleCaptureShutdown(dest) 1198 require.Nil(t, err) 1199 require.True(t, affected) 1200 require.Len(t, msgs, 0) 1201 require.Empty(t, rClone.Captures) 1202 require.Equal(t, "", rClone.Primary) 1203 require.False(t, rClone.hasRole(RoleSecondary)) 1204 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1205 }) 1206 1207 // Secondary shutdown during Commit, Commit -> Commit 1208 t.Run("MoveTableSecondaryShutdownDuringCommit", func(t *testing.T) { 1209 rClone := clone(r) 1210 secondary, ok := rClone.getRole(RoleSecondary) 1211 require.True(t, ok) 1212 msgs, affected, err = rClone.handleCaptureShutdown(secondary) 1213 require.Nil(t, err) 1214 require.True(t, affected) 1215 require.Len(t, msgs, 0) 1216 require.EqualValues(t, map[string]Role{from: RolePrimary}, rClone.Captures) 1217 require.Equal(t, from, rClone.Primary) 1218 require.False(t, rClone.hasRole(RoleSecondary)) 1219 require.Equal(t, ReplicationSetStateCommit, rClone.State) 1220 1221 // Original primary is still replicating, Commit -> Replicating 1222 t.Run("OriginalPrimaryReplicating", func(t *testing.T) { 1223 rClone1 := clone(rClone) 1224 msgs, err = rClone1.handleTableStatus(rClone1.Primary, &tablepb.TableStatus{ 1225 Span: tablepb.Span{TableID: 1}, 1226 State: tablepb.TableStateReplicating, 1227 }) 1228 require.Nil(t, err) 1229 require.Len(t, msgs, 0) 1230 require.EqualValues( 1231 t, map[string]Role{from: RolePrimary}, rClone1.Captures) 1232 require.Equal(t, from, rClone1.Primary) 1233 require.False(t, rClone1.hasRole(RoleSecondary)) 1234 require.Equal(t, ReplicationSetStateReplicating, rClone1.State) 1235 }) 1236 1237 // Original primary is stopped, Commit -> Absent 1238 t.Run("OriginalPrimaryStopped", func(t *testing.T) { 1239 rClone1 := clone(rClone) 1240 msgs, err = rClone1.handleTableStatus(rClone1.Primary, &tablepb.TableStatus{ 1241 Span: tablepb.Span{TableID: 1}, 1242 State: tablepb.TableStateStopped, 1243 }) 1244 require.Nil(t, err) 1245 require.Len(t, msgs, 0) 1246 require.Empty(t, rClone1.Captures) 1247 require.Equal(t, "", rClone1.Primary) 1248 require.False(t, rClone1.hasRole(RoleSecondary)) 1249 require.Equal(t, ReplicationSetStateAbsent, rClone1.State) 1250 }) 1251 1252 // Original primary is absent, Commit -> Absent, 1253 // and then add the original primary back, Absent -> Prepare 1254 t.Run("OriginalPrimaryAbsent", func(t *testing.T) { 1255 rClone1 := clone(rClone) 1256 msgs, err = rClone1.handleTableStatus(rClone1.Primary, &tablepb.TableStatus{ 1257 Span: tablepb.Span{TableID: 1}, 1258 State: tablepb.TableStateAbsent, 1259 }) 1260 require.Nil(t, err) 1261 require.Len(t, msgs, 1) 1262 require.EqualValues(t, &schedulepb.Message{ 1263 To: from, 1264 MsgType: schedulepb.MsgDispatchTableRequest, 1265 DispatchTableRequest: &schedulepb.DispatchTableRequest{ 1266 Request: &schedulepb.DispatchTableRequest_AddTable{ 1267 AddTable: &schedulepb.AddTableRequest{ 1268 Span: tablepb.Span{TableID: r.Span.TableID}, 1269 IsSecondary: true, 1270 Checkpoint: r.Checkpoint, 1271 }, 1272 }, 1273 }, 1274 }, msgs[0]) 1275 require.Contains(t, rClone1.Captures, from) 1276 require.Equal(t, "", rClone1.Primary) 1277 require.True(t, rClone1.isInRole(from, RoleSecondary)) 1278 require.Equal(t, ReplicationSetStatePrepare, rClone1.State) 1279 }) 1280 }) 1281 1282 // Move table, original primary is stopped. 1283 msgs, err = r.handleTableStatus(from, &tablepb.TableStatus{ 1284 Span: tablepb.Span{TableID: tableID}, 1285 State: tablepb.TableStateStopped, 1286 }) 1287 require.Nil(t, err) 1288 require.Len(t, msgs, 1) 1289 require.Equal(t, ReplicationSetStateCommit, r.State) 1290 require.Equal(t, dest, r.Primary) 1291 require.False(t, r.hasRole(RoleSecondary)) 1292 t.Run("MoveTableNewPrimaryShutdownDuringCommit", func(t *testing.T) { 1293 rClone := clone(r) 1294 msgs, affected, err = rClone.handleCaptureShutdown(rClone.Primary) 1295 require.Nil(t, err) 1296 require.True(t, affected) 1297 require.Len(t, msgs, 0) 1298 require.Empty(t, rClone.Captures) 1299 require.Equal(t, "", rClone.Primary) 1300 require.False(t, rClone.hasRole(RoleSecondary)) 1301 require.Equal(t, ReplicationSetStateAbsent, rClone.State) 1302 }) 1303 1304 // Commit -> Replicating 1305 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 1306 Span: tablepb.Span{TableID: tableID}, 1307 State: tablepb.TableStateReplicating, 1308 }) 1309 require.Nil(t, err) 1310 require.Len(t, msgs, 0) 1311 require.Equal(t, ReplicationSetStateReplicating, r.State) 1312 require.Equal(t, dest, r.Primary) 1313 require.False(t, r.hasRole(RoleSecondary)) 1314 1315 // Unknown capture shutdown has no effect. 1316 t.Run("UnknownCaptureShutdown", func(t *testing.T) { 1317 rClone := clone(r) 1318 msgs, affected, err = rClone.handleCaptureShutdown("unknown") 1319 require.Nil(t, err) 1320 require.False(t, affected) 1321 require.Len(t, msgs, 0) 1322 require.EqualValues(t, r, rClone) 1323 }) 1324 } 1325 1326 func TestReplicationSetCaptureShutdownAfterReconstructCommitState(t *testing.T) { 1327 t.Parallel() 1328 1329 // Reconstruct commit state 1330 from := "1" 1331 tableID := model.TableID(1) 1332 tableStatus := map[model.CaptureID]*tablepb.TableStatus{ 1333 from: {Span: tablepb.Span{TableID: tableID}, State: tablepb.TableStatePrepared}, 1334 } 1335 span := tablepb.Span{TableID: tableID} 1336 r, err := NewReplicationSet(span, 0, tableStatus, model.ChangeFeedID{}) 1337 require.Nil(t, err) 1338 require.Equal(t, ReplicationSetStateCommit, r.State) 1339 require.Equal(t, "", r.Primary) 1340 require.True(t, r.isInRole(from, RoleSecondary)) 1341 1342 // Commit -> Absent as there is no primary nor secondary. 1343 msg, affected, err := r.handleCaptureShutdown(from) 1344 require.Nil(t, err) 1345 require.True(t, affected) 1346 require.Empty(t, msg) 1347 require.Equal(t, ReplicationSetStateAbsent, r.State) 1348 require.Equal(t, "", r.Primary) 1349 require.False(t, r.hasRole(RoleSecondary)) 1350 } 1351 1352 func TestReplicationSetMoveTableWithHeartbeatResponse(t *testing.T) { 1353 t.Parallel() 1354 1355 tableID := model.TableID(1) 1356 span := tablepb.Span{TableID: tableID} 1357 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 1358 require.Nil(t, err) 1359 1360 source := "1" 1361 dest := "2" 1362 r.State = ReplicationSetStateReplicating 1363 require.Nil(t, r.setCapture(source, RoleSecondary)) 1364 require.Nil(t, r.promoteSecondary(source)) 1365 1366 // Replicating -> Prepare 1367 msgs, err := r.handleMoveTable(dest) 1368 require.Nil(t, err) 1369 require.Len(t, msgs, 1) 1370 require.Equal(t, ReplicationSetStatePrepare, r.State) 1371 require.True(t, r.isInRole(dest, RoleSecondary)) 1372 require.Equal(t, source, r.Primary) 1373 1374 // Prepare -> Commit. 1375 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 1376 Span: tablepb.Span{TableID: tableID}, 1377 State: tablepb.TableStatePrepared, 1378 }) 1379 require.Nil(t, err) 1380 require.Len(t, msgs, 1) 1381 require.Equal(t, ReplicationSetStateCommit, r.State) 1382 require.Equal(t, source, r.Primary) 1383 require.True(t, r.isInRole(dest, RoleSecondary)) 1384 1385 // Source updates it's table status 1386 // Source is removed. 1387 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 1388 Span: tablepb.Span{TableID: tableID}, 1389 State: tablepb.TableStateStopped, 1390 Checkpoint: tablepb.Checkpoint{ 1391 CheckpointTs: 3, 1392 ResolvedTs: 4, 1393 }, 1394 }) 1395 require.Nil(t, err) 1396 require.Len(t, msgs, 1) 1397 require.Equal(t, ReplicationSetStateCommit, r.State) 1398 require.Equal(t, dest, r.Primary) 1399 require.False(t, r.hasRole(RoleSecondary)) 1400 require.Equal(t, tablepb.Checkpoint{ 1401 CheckpointTs: 3, 1402 ResolvedTs: 4, 1403 }, r.Checkpoint) 1404 1405 // Source sends a heartbeat response. 1406 msgs, err = r.handleTableStatus(source, &tablepb.TableStatus{ 1407 Span: tablepb.Span{TableID: tableID}, 1408 State: tablepb.TableStateAbsent, 1409 }) 1410 require.Nil(t, err) 1411 require.Len(t, msgs, 0) 1412 require.Equal(t, ReplicationSetStateCommit, r.State) 1413 require.Equal(t, dest, r.Primary) 1414 require.False(t, r.hasRole(RoleSecondary)) 1415 require.Equal(t, tablepb.Checkpoint{ 1416 CheckpointTs: 3, 1417 ResolvedTs: 4, 1418 }, r.Checkpoint) 1419 1420 // Commit -> Replicating 1421 msgs, err = r.handleTableStatus(dest, &tablepb.TableStatus{ 1422 Span: tablepb.Span{TableID: tableID}, 1423 State: tablepb.TableStateReplicating, 1424 }) 1425 require.Nil(t, err) 1426 require.Len(t, msgs, 0) 1427 require.Equal(t, ReplicationSetStateReplicating, r.State) 1428 require.Equal(t, dest, r.Primary) 1429 require.False(t, r.hasRole(RoleSecondary)) 1430 } 1431 1432 func TestReplicationSetMarshalJSON(t *testing.T) { 1433 t.Parallel() 1434 1435 b, err := json.Marshal(ReplicationSet{State: ReplicationSetStateReplicating}) 1436 require.Nil(t, err) 1437 require.Contains(t, string(b), "Replicating", string(b)) 1438 } 1439 1440 func TestReplicationSetMoveTableSameDestCapture(t *testing.T) { 1441 t.Parallel() 1442 1443 tableID := model.TableID(1) 1444 span := tablepb.Span{TableID: tableID} 1445 r, err := NewReplicationSet(span, 0, nil, model.ChangeFeedID{}) 1446 require.Nil(t, err) 1447 1448 source := "1" 1449 dest := source 1450 r.State = ReplicationSetStateReplicating 1451 require.Nil(t, r.setCapture(source, RoleSecondary)) 1452 require.Nil(t, r.promoteSecondary(source)) 1453 1454 // Ignore move table. 1455 msgs, err := r.handleMoveTable(dest) 1456 require.Nil(t, err) 1457 require.Len(t, msgs, 0) 1458 require.Equal(t, ReplicationSetStateReplicating, r.State) 1459 require.False(t, r.hasRole(RoleSecondary)) 1460 require.Equal(t, source, r.Primary) 1461 } 1462 1463 func TestReplicationSetCommitRestart(t *testing.T) { 1464 t.Parallel() 1465 1466 // Primary has received remove table message and is currently stopping. 1467 tableStatus := map[model.CaptureID]*tablepb.TableStatus{ 1468 "1": { 1469 State: tablepb.TableStatePrepared, 1470 Checkpoint: tablepb.Checkpoint{}, 1471 }, 1472 "2": { 1473 State: tablepb.TableStateStopping, 1474 Checkpoint: tablepb.Checkpoint{}, 1475 }, 1476 } 1477 span := tablepb.Span{TableID: 0} 1478 r, err := NewReplicationSet(span, 0, tableStatus, model.ChangeFeedID{}) 1479 require.Nil(t, err) 1480 require.Equal(t, ReplicationSetStateCommit, r.State) 1481 require.EqualValues(t, RoleSecondary, r.Captures["1"]) 1482 require.Equal(t, "", r.Primary) 1483 require.Contains(t, r.Captures, "2") 1484 1485 // Can not promote to primary as there are other captures. 1486 msgs, err := r.handleTableStatus("1", &tablepb.TableStatus{ 1487 Span: tablepb.Span{TableID: 0}, 1488 State: tablepb.TableStatePrepared, 1489 }) 1490 require.Nil(t, err) 1491 require.Len(t, msgs, 0) 1492 require.Equal(t, ReplicationSetStateCommit, r.State) 1493 require.EqualValues(t, RoleSecondary, r.Captures["1"]) 1494 require.Equal(t, "", r.Primary) 1495 require.Contains(t, r.Captures, "2") 1496 1497 // Table status reported by other captures does not change replication set. 1498 msgs, err = r.handleTableStatus("2", &tablepb.TableStatus{ 1499 Span: tablepb.Span{TableID: 0}, 1500 State: tablepb.TableStateStopping, 1501 }) 1502 require.Nil(t, err) 1503 require.Len(t, msgs, 0) 1504 require.Equal(t, ReplicationSetStateCommit, r.State) 1505 require.EqualValues(t, RoleSecondary, r.Captures["1"]) 1506 require.Equal(t, "", r.Primary) 1507 require.Contains(t, r.Captures, "2") 1508 1509 // Only Stopped or Absent allows secondary to be promoted. 1510 rClone := clone(r) 1511 msgs, err = rClone.handleTableStatus("2", &tablepb.TableStatus{ 1512 Span: tablepb.Span{TableID: 0}, 1513 State: tablepb.TableStateAbsent, 1514 }) 1515 require.Nil(t, err) 1516 require.Len(t, msgs, 0) 1517 require.Equal(t, ReplicationSetStateCommit, rClone.State) 1518 require.EqualValues(t, RoleSecondary, rClone.Captures["1"]) 1519 require.Equal(t, "", rClone.Primary) 1520 require.NotContains(t, rClone.Captures, "2") 1521 msgs, err = r.handleTableStatus("2", &tablepb.TableStatus{ 1522 Span: tablepb.Span{TableID: 0}, 1523 State: tablepb.TableStateStopped, 1524 }) 1525 require.Nil(t, err) 1526 require.Len(t, msgs, 0) 1527 require.Equal(t, ReplicationSetStateCommit, r.State) 1528 require.EqualValues(t, RoleSecondary, r.Captures["1"]) 1529 require.Equal(t, "", r.Primary) 1530 require.NotContains(t, r.Captures, "2") 1531 1532 // No other captures, promote secondary. 1533 msgs, err = r.handleTableStatus("1", &tablepb.TableStatus{ 1534 Span: tablepb.Span{TableID: 0}, 1535 State: tablepb.TableStatePrepared, 1536 }) 1537 require.Nil(t, err) 1538 require.Len(t, msgs, 1) 1539 require.Equal(t, "1", msgs[0].To) 1540 require.False(t, msgs[0].DispatchTableRequest.GetAddTable().IsSecondary) 1541 require.Equal(t, ReplicationSetStateCommit, r.State) 1542 require.Equal(t, "1", r.Primary) 1543 require.False(t, r.hasRole(RoleSecondary)) 1544 } 1545 1546 func TestReplicationSetRemoveRestart(t *testing.T) { 1547 t.Parallel() 1548 1549 // Primary has received remove table message and is currently stopping. 1550 tableStatus := map[model.CaptureID]*tablepb.TableStatus{ 1551 "1": { 1552 State: tablepb.TableStateStopping, 1553 Checkpoint: tablepb.Checkpoint{}, 1554 }, 1555 "2": { 1556 State: tablepb.TableStateStopping, 1557 Checkpoint: tablepb.Checkpoint{}, 1558 }, 1559 } 1560 span := tablepb.Span{TableID: 0} 1561 r, err := NewReplicationSet(span, 0, tableStatus, model.ChangeFeedID{}) 1562 require.Nil(t, err) 1563 require.Equal(t, ReplicationSetStateRemoving, r.State) 1564 require.False(t, r.hasRole(RoleSecondary)) 1565 require.Equal(t, "", r.Primary) 1566 require.Contains(t, r.Captures, "1") 1567 require.Contains(t, r.Captures, "2") 1568 require.False(t, r.hasRemoved()) 1569 1570 // A capture reports its status. 1571 msgs, err := r.handleTableStatus("2", &tablepb.TableStatus{ 1572 Span: tablepb.Span{TableID: 0}, 1573 State: tablepb.TableStateStopping, 1574 }) 1575 require.Nil(t, err) 1576 require.Len(t, msgs, 0) 1577 require.False(t, r.hasRemoved()) 1578 1579 // A capture stopped. 1580 msgs, err = r.handleTableStatus("2", &tablepb.TableStatus{ 1581 Span: tablepb.Span{TableID: 0}, 1582 State: tablepb.TableStateStopped, 1583 }) 1584 require.Nil(t, err) 1585 require.Len(t, msgs, 0) 1586 require.False(t, r.hasRemoved()) 1587 1588 // Another capture stopped too. 1589 msgs, err = r.handleTableStatus("1", &tablepb.TableStatus{ 1590 Span: tablepb.Span{TableID: 0}, 1591 State: tablepb.TableStateAbsent, 1592 }) 1593 require.Nil(t, err) 1594 require.Len(t, msgs, 0) 1595 require.True(t, r.hasRemoved()) 1596 } 1597 1598 func TestReplicationSetHeap_Len(t *testing.T) { 1599 t.Parallel() 1600 1601 h := NewReplicationSetHeap(defaultSlowTableHeapSize) 1602 require.Equal(t, 0, h.Len()) 1603 1604 h = append(h, &ReplicationSet{Span: spanz.TableIDToComparableSpan(0)}) 1605 require.Equal(t, 1, h.Len()) 1606 1607 h = append(h, &ReplicationSet{Span: spanz.TableIDToComparableSpan(1)}) 1608 require.Equal(t, 2, h.Len()) 1609 } 1610 1611 func TestReplicationSetHeap_Less(t *testing.T) { 1612 t.Parallel() 1613 1614 h := NewReplicationSetHeap(defaultSlowTableHeapSize) 1615 h = append(h, &ReplicationSet{ 1616 Span: spanz.TableIDToComparableSpan(0), 1617 Checkpoint: tablepb.Checkpoint{CheckpointTs: 1}, 1618 }) 1619 h = append(h, &ReplicationSet{ 1620 Span: spanz.TableIDToComparableSpan(1), 1621 Checkpoint: tablepb.Checkpoint{CheckpointTs: 2, ResolvedTs: 3}, 1622 }) 1623 h = append(h, &ReplicationSet{ 1624 Span: spanz.TableIDToComparableSpan(2), 1625 Checkpoint: tablepb.Checkpoint{CheckpointTs: 2, ResolvedTs: 4}, 1626 }) 1627 require.True(t, h.Less(1, 0)) 1628 require.True(t, h.Less(2, 1)) 1629 } 1630 1631 func TestReplicationSetHeap_Basic(t *testing.T) { 1632 t.Parallel() 1633 1634 h := NewReplicationSetHeap(defaultSlowTableHeapSize) 1635 heap.Init(&h) 1636 heap.Push(&h, &ReplicationSet{ 1637 Span: spanz.TableIDToComparableSpan(0), 1638 Checkpoint: tablepb.Checkpoint{CheckpointTs: 1}, 1639 }) 1640 heap.Push(&h, &ReplicationSet{ 1641 Span: spanz.TableIDToComparableSpan(1), 1642 Checkpoint: tablepb.Checkpoint{CheckpointTs: 2}, 1643 }) 1644 require.Equal(t, 2, h.Len()) 1645 1646 require.Equal(t, int64(1), heap.Pop(&h).(*ReplicationSet).Span.TableID) 1647 require.Equal(t, 1, h.Len()) 1648 1649 require.Equal(t, int64(0), heap.Pop(&h).(*ReplicationSet).Span.TableID) 1650 require.Equal(t, 0, h.Len()) 1651 } 1652 1653 // TestReplicationSetHeap_MinK tests that the heap can be 1654 // used to keep the min K elements. 1655 func TestReplicationSetHeap_MinK(t *testing.T) { 1656 t.Parallel() 1657 1658 // K = defaultSlowTableHeapSize 1659 h := NewReplicationSetHeap(defaultSlowTableHeapSize) 1660 heap.Init(&h) 1661 1662 for i := 2 * defaultSlowTableHeapSize; i > 0; i-- { 1663 replicationSet := &ReplicationSet{ 1664 Span: spanz.TableIDToComparableSpan(int64(i)), 1665 Checkpoint: tablepb.Checkpoint{CheckpointTs: uint64(i)}, 1666 } 1667 heap.Push(&h, replicationSet) 1668 if h.Len() > defaultSlowTableHeapSize { 1669 heap.Pop(&h) 1670 } 1671 } 1672 1673 require.Equal(t, defaultSlowTableHeapSize, h.Len()) 1674 1675 expectedTables := make([]int64, 0) 1676 for i := defaultSlowTableHeapSize; i > 0; i-- { 1677 expectedTables = append(expectedTables, int64(i)) 1678 } 1679 1680 tables := make([]model.TableID, 0) 1681 tableCounts := h.Len() 1682 for i := 0; i < tableCounts; i++ { 1683 element := heap.Pop(&h).(*ReplicationSet) 1684 t.Log(element.Span) 1685 tables = append(tables, element.Span.TableID) 1686 } 1687 require.Equal(t, expectedTables, tables) 1688 require.Equal(t, 0, h.Len()) 1689 } 1690 1691 func TestUpdateCheckpointAndStats(t *testing.T) { 1692 cases := []struct { 1693 checkpoint tablepb.Checkpoint 1694 stats tablepb.Stats 1695 }{ 1696 { 1697 checkpoint: tablepb.Checkpoint{ 1698 CheckpointTs: 1, 1699 ResolvedTs: 2, 1700 }, 1701 stats: tablepb.Stats{}, 1702 }, 1703 { 1704 checkpoint: tablepb.Checkpoint{ 1705 CheckpointTs: 2, 1706 ResolvedTs: 1, 1707 }, 1708 stats: tablepb.Stats{}, 1709 }, 1710 } 1711 r := &ReplicationSet{} 1712 for _, c := range cases { 1713 r.updateCheckpointAndStats(c.checkpoint, c.stats) 1714 } 1715 }