github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/internal/master/worker_manager_test.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package master

import (
	"context"
	"sync"
	"testing"
	"time"

	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/engine/framework/config"
	"github.com/pingcap/tiflow/engine/framework/logutil"
	"github.com/pingcap/tiflow/engine/framework/metadata"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/statusutil"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"golang.org/x/time/rate"
)

type workerManageTestSuite struct {
	manager       *WorkerManager
	masterNode    p2p.NodeID
	meta          pkgOrm.Client
	messageSender p2p.MessageSender
	clock         *clock.Mock

	events map[frameModel.WorkerID]*masterEvent
}

func (s *workerManageTestSuite) AdvanceClockBy(duration time.Duration) {
	s.clock.Add(duration)
}

func (s *workerManageTestSuite) SimulateHeartbeat(
	workerID frameModel.WorkerID,
	epoch frameModel.Epoch, workerEpoch frameModel.Epoch,
	node p2p.NodeID, isFinished bool,
) {
	s.manager.HandleHeartbeat(&frameModel.HeartbeatPingMessage{
		SendTime:     s.clock.Mono(),
		FromWorkerID: workerID,
		Epoch:        epoch,
		WorkerEpoch:  workerEpoch,
		IsFinished:   isFinished,
	}, node)
}

func (s *workerManageTestSuite) SimulateWorkerUpdateStatus(
	workerID frameModel.WorkerID, status *frameModel.WorkerStatus, epoch frameModel.Epoch,
) error {
	err := s.meta.UpsertWorker(context.Background(), status)
	if err != nil {
		return err
	}

	s.manager.OnWorkerStatusUpdateMessage(&statusutil.WorkerStatusMessage{
		Worker:      workerID,
		MasterEpoch: epoch,
		Status:      status,
	})
	return nil
}

func (s *workerManageTestSuite) PutMeta(workerID frameModel.WorkerID, status *frameModel.WorkerStatus) error {
	status.JobID = "master-1"
	status.ID = workerID
	return s.meta.UpsertWorker(context.Background(), status)
}
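
// The four onWorkerXxx callbacks below are the hooks passed to
// NewWorkerManager in NewWorkerManageTestSuite. Each records at most one
// pending masterEvent per worker, so a test can assert that exactly one
// event fires for a given state transition.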
func (s *workerManageTestSuite) onWorkerOnline(ctx context.Context, handle WorkerHandle) error {
	if event, exists := s.events[handle.ID()]; exists {
		log.Warn("found unexpected event", zap.Any("event", event))
		return errors.New("unexpected event already exists")
	}
	s.events[handle.ID()] = &masterEvent{
		Tp:     workerOnlineEvent,
		Handle: handle,
	}
	return nil
}

func (s *workerManageTestSuite) onWorkerOffline(ctx context.Context, handle WorkerHandle, err error) error {
	if event, exists := s.events[handle.ID()]; exists {
		log.Warn("found unexpected event", zap.Any("event", event))
		return errors.New("unexpected event already exists")
	}
	s.events[handle.ID()] = &masterEvent{
		Tp:     workerOfflineEvent,
		Handle: handle,
		Err:    err,
	}
	return nil
}

func (s *workerManageTestSuite) onWorkerStatusUpdated(ctx context.Context, handle WorkerHandle) error {
	if event, exists := s.events[handle.ID()]; exists {
		log.Warn("found unexpected event", zap.Any("event", event))
		return errors.New("unexpected event already exists")
	}
	s.events[handle.ID()] = &masterEvent{
		Tp:     workerStatusUpdatedEvent,
		Handle: handle,
	}
	return nil
}

func (s *workerManageTestSuite) onWorkerDispatched(ctx context.Context, handle WorkerHandle, err error) error {
	if event, exists := s.events[handle.ID()]; exists {
		log.Warn("found unexpected event", zap.Any("event", event))
		return errors.New("unexpected event already exists")
	}
	s.events[handle.ID()] = &masterEvent{
		Tp:     workerDispatchFailedEvent,
		Handle: handle,
		Err:    err,
	}
	return nil
}

func (s *workerManageTestSuite) WaitForEvent(t *testing.T, workerID frameModel.WorkerID) *masterEvent {
	timeoutCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	rl := rate.NewLimiter(rate.Every(10*time.Millisecond), 1)

	for {
		select {
		case <-timeoutCtx.Done():
			t.Fatalf("WaitForEvent timed out, workerID: %s", workerID)
		default:
		}

		// The Tick should return very quickly.
		tickCtx, cancel := context.WithTimeout(timeoutCtx, 100*time.Millisecond)
		err := s.manager.Tick(tickCtx)
		cancel()
		require.NoError(t, err)

		event, exists := s.events[workerID]
		if !exists {
			err := rl.Wait(timeoutCtx)
			require.NoError(t, err)

			s.AdvanceClockBy(1 * time.Second)
			continue
		}

		require.Equal(t, workerID, event.Handle.ID())
		delete(s.events, workerID)
		return event
	}
}

func (s *workerManageTestSuite) AssertNoEvents(t *testing.T, workerID frameModel.WorkerID, waitFor time.Duration) {
	timeoutCtx, cancel := context.WithTimeout(context.Background(), waitFor)
	defer cancel()

	rl := rate.NewLimiter(rate.Every(10*time.Millisecond), 1)

	for {
		select {
		case <-timeoutCtx.Done():
			return
		default:
		}

		// The Tick should return very quickly.
		tickCtx, cancel := context.WithTimeout(timeoutCtx, 100*time.Millisecond)
		err := s.manager.Tick(tickCtx)
		cancel()
		if err != nil {
			if errors.Cause(err) == context.DeadlineExceeded {
				return
			}
			require.NoError(t, err)
		}

		_, exists := s.events[workerID]
		require.False(t, exists)

		_ = rl.Wait(timeoutCtx)
	}
}

func (s *workerManageTestSuite) Close() {
	s.manager.Close()
	// Prevents SQL connection leak.
	_ = s.meta.Close()
}

func NewWorkerManageTestSuite(isInit bool) *workerManageTestSuite {
	cli, err := pkgOrm.NewMockClient()
	if err != nil {
		panic(err)
	}
	ret := &workerManageTestSuite{
		meta:          cli,
		masterNode:    "executor-0",
		messageSender: p2p.NewMockMessageSender(),
		clock:         clock.NewMock(),
		events:        make(map[frameModel.WorkerID]*masterEvent),
	}
	masterID := "master-1"
	logger := logutil.WithMasterID(log.L(), masterID)
	manager := NewWorkerManager(
		masterID,
		1,
		ret.meta,
		ret.messageSender,
		ret.onWorkerOnline,
		ret.onWorkerOffline,
		ret.onWorkerStatusUpdated,
		ret.onWorkerDispatched,
		isInit,
		config.DefaultTimeoutConfig(),
		ret.clock).
		WithLogger(logger)
	ret.manager = manager
	return ret
}
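
// TestCreateWorkerAndWorkerOnline covers the happy path: a worker announced
// via BeforeStartingWorker sends heartbeats and the manager reports it
// online.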
func TestCreateWorkerAndWorkerOnline(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	wEpoch := int64(2)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOnlineEvent, event.Tp)
	suite.Close()
}

func TestCreateWorkerAndWorkerTimesOut(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 2)
	suite.AdvanceClockBy(30 * time.Second)
	suite.AdvanceClockBy(30 * time.Second)
	suite.AdvanceClockBy(30 * time.Second)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)
	require.NotNil(t, event.Handle.GetTombstone())

	suite.AssertNoEvents(t, "worker-1", 500*time.Millisecond)
	suite.Close()
}

func TestCreateWorkerPredispatchFailed(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	suite.manager.AbortCreatingWorker("worker-1", errors.New("injected error"))

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerDispatchFailedEvent, event.Tp)
	require.NotNil(t, event.Handle.GetTombstone())
	require.Error(t, event.Err)
	require.Regexp(t, ".*injected error.*", event.Err)

	suite.AssertNoEvents(t, "worker-1", 500*time.Millisecond)
	suite.Close()
}

func TestCreateWorkerAndWorkerStatusUpdatedAndTimesOut(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	wEpoch := int64(2)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOnlineEvent, event.Tp)

	err := suite.SimulateWorkerUpdateStatus("worker-1", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateFinished,
	}, 1)
	require.NoError(t, err)

	event = suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerStatusUpdatedEvent, event.Tp)
	require.Equal(t, frameModel.WorkerStateFinished, event.Handle.Status().State)

	suite.AdvanceClockBy(30 * time.Second)
	event = suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)
	require.NotNil(t, event.Handle.GetTombstone())
	require.True(t, errors.Is(event.Err, errors.ErrWorkerFinish))

	suite.Close()
}

func TestRecoverAfterFailover(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	suite := NewWorkerManageTestSuite(false)
	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: 11,
	})
	require.NoError(t, err)
	err = suite.PutMeta("worker-2", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: 12,
	})
	require.NoError(t, err)
	err = suite.PutMeta("worker-3", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: 13,
	})
	require.NoError(t, err)
	// worker-4 is put in the metastore but never sends a heartbeat,
	// so it should be recovered as a tombstone.
	err = suite.PutMeta("worker-4", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: 14,
	})
	require.NoError(t, err)

	doneCh := make(chan struct{})
	go func() {
		defer close(doneCh)
		err := suite.manager.InitAfterRecover(ctx)
		require.NoError(t, err)
	}()

	require.Eventually(t, func() bool {
		suite.SimulateHeartbeat("worker-1", 1, 11, "executor-1", false)
		suite.SimulateHeartbeat("worker-2", 1, 12, "executor-2", false)
		suite.SimulateHeartbeat("worker-3", 1, 13, "executor-3", false)

		select {
		case <-doneCh:
			return true
		default:
		}
		suite.AdvanceClockBy(1 * time.Second)
		return false
	}, 5*time.Second, 10*time.Millisecond)

	require.True(t, suite.manager.IsInitialized())
	require.Len(t, suite.manager.GetWorkers(), 4)
	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
	require.Contains(t, suite.manager.GetWorkers(), "worker-2")
	require.Contains(t, suite.manager.GetWorkers(), "worker-3")
	require.Contains(t, suite.manager.GetWorkers(), "worker-4")
	require.Nil(t, suite.manager.GetWorkers()["worker-1"].GetTombstone())
	require.Nil(t, suite.manager.GetWorkers()["worker-2"].GetTombstone())
	require.Nil(t, suite.manager.GetWorkers()["worker-3"].GetTombstone())
	require.NotNil(t, suite.manager.GetWorkers()["worker-4"].GetTombstone())
	suite.Close()
}
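
// TestRecoverAfterFailoverFast verifies that recovery completes as soon as
// the only known worker has sent a heartbeat, without the mock clock being
// advanced toward the timeout.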
func TestRecoverAfterFailoverFast(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	suite := NewWorkerManageTestSuite(false)
	wEpoch := int64(100)
	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: wEpoch,
	})
	require.NoError(t, err)

	doneCh := make(chan struct{})
	go func() {
		defer close(doneCh)
		err := suite.manager.InitAfterRecover(ctx)
		require.NoError(t, err)
	}()

	require.Eventually(t, func() bool {
		suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
		select {
		case <-doneCh:
			return true
		default:
		}
		return false
	}, 1*time.Second, 10*time.Millisecond)

	require.True(t, suite.manager.IsInitialized())
	require.Len(t, suite.manager.GetWorkers(), 1)
	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
	suite.Close()
}

func TestRecoverWithNoWorker(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	suite := NewWorkerManageTestSuite(false)

	// Since there is no worker info in the metastore,
	// recovering should be very fast.
	// Since we are using a mock clock, and we are NOT advancing it,
	// InitAfterRecover returning at all would indicate a successful test.
	err := suite.manager.InitAfterRecover(ctx)
	require.NoError(t, err)

	suite.Close()
}
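
// TestCleanTombstone checks that cleaning a timed-out worker's tombstone
// deletes its metadata, that CleanTombstone is idempotent, and that a worker
// with the same ID can be created again afterwards.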
func TestCleanTombstone(t *testing.T) {
	t.Parallel()

	ctx := context.Background()

	suite := NewWorkerManageTestSuite(true)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 2)
	suite.AdvanceClockBy(30 * time.Second)
	suite.AdvanceClockBy(30 * time.Second)
	suite.AdvanceClockBy(30 * time.Second)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)
	require.NotNil(t, event.Handle.GetTombstone())
	err := event.Handle.GetTombstone().CleanTombstone(ctx)
	require.NoError(t, err)

	workerMetaClient := metadata.NewWorkerStatusClient("master-1", suite.meta)
	_, err = workerMetaClient.Load(ctx, "worker-1")
	// Asserts that the meta for the worker is indeed deleted.
	require.Error(t, err)
	require.Regexp(t, ".*ErrMetaEntryNotFound", err)

	// CleanTombstone should be idempotent for robustness.
	err = event.Handle.GetTombstone().CleanTombstone(ctx)
	require.NoError(t, err)

	// Recreating a worker with the same name should work fine.
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 10)

	suite.Close()
}

func TestWorkerGracefulExit(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	wEpoch := int64(2)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOnlineEvent, event.Tp)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)
	event = suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)

	suite.Close()
}

func TestWorkerGracefulExitOnFirstHeartbeat(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	wEpoch := int64(2)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)

	// Now we expect there to be both workerOnlineEvent and workerOfflineEvent,
	// in that order.
	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOnlineEvent, event.Tp)
	event = suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)

	suite.Close()
}
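
// TestWorkerGracefulExitAfterFailover verifies that a worker whose first
// heartbeat after a master failover already carries IsFinished is recorded
// as a tombstone once recovery completes.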
func TestWorkerGracefulExitAfterFailover(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	suite := NewWorkerManageTestSuite(false)
	wEpoch := int64(2)
	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
		State: frameModel.WorkerStateNormal,
		Epoch: wEpoch,
	})
	require.NoError(t, err)

	doneCh := make(chan struct{})
	go func() {
		defer close(doneCh)
		err := suite.manager.InitAfterRecover(ctx)
		require.NoError(t, err)
	}()

	require.Eventually(t, func() bool {
		suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)
		select {
		case <-doneCh:
			return true
		default:
		}
		suite.AdvanceClockBy(1 * time.Second)
		return false
	}, 1*time.Second, 10*time.Millisecond)

	require.True(t, suite.manager.IsInitialized())
	require.Len(t, suite.manager.GetWorkers(), 1)
	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
	require.NotNil(t, suite.manager.GetWorkers()["worker-1"].GetTombstone())
	suite.Close()
}

func TestWorkerSendsStaleHeartbeat(t *testing.T) {
	t.Parallel()

	suite := NewWorkerManageTestSuite(true)
	wEpoch := int64(2)
	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)

	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)

	event := suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOnlineEvent, event.Tp)

	ctx, cancel := context.WithCancel(context.Background())
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-ctx.Done():
				return
			case <-time.After(time.Millisecond * 20):
				// Heartbeats carrying an outdated worker epoch are stale
				// and must not keep the worker alive.
				suite.SimulateHeartbeat("worker-1", 1, wEpoch-1, "executor-1", false)
			}
		}
	}()

	event = suite.WaitForEvent(t, "worker-1")
	require.Equal(t, workerOfflineEvent, event.Tp)

	suite.Close()
	cancel()
	wg.Wait()
}