github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/master.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package framework

import (
	"bytes"
	"context"
	"encoding/json"
	"sync"
	"time"

	"github.com/BurntSushi/toml"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/engine/framework/config"
	"github.com/pingcap/tiflow/engine/framework/internal/master"
	frameLog "github.com/pingcap/tiflow/engine/framework/logutil"
	"github.com/pingcap/tiflow/engine/framework/metadata"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/statusutil"
	"github.com/pingcap/tiflow/engine/pkg/client"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	dcontext "github.com/pingcap/tiflow/engine/pkg/context"
	"github.com/pingcap/tiflow/engine/pkg/deps"
	"github.com/pingcap/tiflow/engine/pkg/errctx"
	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
	"github.com/pingcap/tiflow/engine/pkg/meta"
	metaModel "github.com/pingcap/tiflow/engine/pkg/meta/model"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/pingcap/tiflow/engine/pkg/quota"
	"github.com/pingcap/tiflow/engine/pkg/tenant"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/label"
	"github.com/pingcap/tiflow/pkg/logutil"
	"github.com/pingcap/tiflow/pkg/uuid"
	"go.uber.org/atomic"
	"go.uber.org/dig"
	"go.uber.org/zap"
)

// Master defines a basic interface that can run in the dataflow engine runtime.
type Master interface {
	Init(ctx context.Context) error
	Poll(ctx context.Context) error
	MasterID() frameModel.MasterID
	Close(ctx context.Context) error
	Stop(ctx context.Context) error
	NotifyExit(ctx context.Context, errIn error) error
}
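// A typical driver loop for a Master, sketched for illustration only (the real
// scheduling lives in the engine runtime; runMaster and the tick interval are
// hypothetical):
//
//	func runMaster(ctx context.Context, m Master) error {
//		if err := m.Init(ctx); err != nil {
//			_ = m.Close(ctx)
//			return err
//		}
//		ticker := time.NewTicker(50 * time.Millisecond)
//		defer ticker.Stop()
//		for range ticker.C {
//			if err := m.Poll(ctx); err != nil {
//				// e.g. errors.ErrMasterClosed once the master is closing
//				_ = m.Close(ctx)
//				return err
//			}
//		}
//		return nil
//	}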
// MasterImpl defines the interface to implement a master; business logic can be
// added in the functions of this interface.
type MasterImpl interface {
	// InitImpl is called the first time the MasterImpl instance is initialized,
	// after OnOpenAPIInitialized. When InitImpl returns without error, the framework
	// persists an internal state so that further failovers call OnMasterRecovered
	// rather than InitImpl.
	// Return:
	// - error to let the framework call CloseImpl; the framework may retry InitImpl
	//   a few times afterwards. For non-retryable failures, business logic should
	//   call Exit.
	// Concurrent safety:
	// - this function is not concurrent with other callbacks.
	InitImpl(ctx context.Context) error

	// OnMasterRecovered is called when the MasterImpl instance has been failed over
	// by the framework after an error. For this MasterImpl instance, it's called
	// after OnOpenAPIInitialized.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function is not concurrent with other callbacks.
	OnMasterRecovered(ctx context.Context) error

	// Tick is called on a fixed interval after MasterImpl's InitImpl or OnMasterRecovered;
	// business logic can do periodic tasks here.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with other callbacks except for
	//   Tick itself, OnOpenAPIInitialized, InitImpl, OnMasterRecovered, CloseImpl,
	//   StopImpl.
	Tick(ctx context.Context) error

	// OnWorkerDispatched is called when the asynchronous action of CreateWorker
	// has finished. Only after OnWorkerDispatched may OnWorkerOnline and
	// OnWorkerStatusUpdated of the same worker be called.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel.
	OnWorkerDispatched(worker WorkerHandle, result error) error

	// OnWorkerOnline is called when the first heartbeat for a worker is received.
	// NOTE: OnWorkerOffline can appear without OnWorkerOnline.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel, and the same worker's OnWorkerStatusUpdated.
	OnWorkerOnline(worker WorkerHandle) error

	// OnWorkerOffline is called as the consequence of the worker's Exit or a
	// heartbeat timeout. It's the last callback among OnWorkerXXX for a worker.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel.
	OnWorkerOffline(worker WorkerHandle, reason error) error

	// OnWorkerMessage is called when a customized message is received.
	OnWorkerMessage(worker WorkerHandle, topic p2p.Topic, message interface{}) error

	// OnWorkerStatusUpdated is called as the consequence of the worker's UpdateStatus.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel, and the same worker's OnWorkerOnline.
	OnWorkerStatusUpdated(worker WorkerHandle, newStatus *frameModel.WorkerStatus) error

	// CloseImpl is called as the consequence of returning an error from InitImpl,
	// OnMasterRecovered or Tick; Tick is stopped after entering this function,
	// and the framework may try to create a new MasterImpl instance afterwards.
	// Business logic is expected to release resources here, but developers should
	// be aware that when the runtime crashes, CloseImpl has no chance to be called.
	// TODO: no other callbacks should be called after or concurrently with CloseImpl.
	// Concurrent safety:
	// - this function may be called concurrently with OnWorkerMessage, OnCancel,
	//   OnWorkerDispatched, OnWorkerOnline, OnWorkerOffline, OnWorkerStatusUpdated.
	CloseImpl(ctx context.Context)

	// StopImpl is called as the consequence of business logic calling Exit. Tick is
	// stopped after entering this function, and the framework treats this MasterImpl
	// as non-recoverable.
	// There's at most one invocation of StopImpl after Exit. If the runtime crashes,
	// StopImpl has no chance to be called.
	// Concurrent safety:
	// - this function may be called concurrently with OnWorkerMessage, OnCancel,
	//   OnWorkerDispatched, OnWorkerOnline, OnWorkerOffline, OnWorkerStatusUpdated.
	StopImpl(ctx context.Context)
}
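// exampleMaster is a minimal, do-nothing MasterImpl skeleton, kept here for
// illustration only; a real implementation would hold business state and be
// bound to a BaseMaster created via NewBaseMaster.
type exampleMaster struct{}

var _ MasterImpl = (*exampleMaster)(nil)

func (e *exampleMaster) InitImpl(ctx context.Context) error          { return nil }
func (e *exampleMaster) OnMasterRecovered(ctx context.Context) error { return nil }
func (e *exampleMaster) Tick(ctx context.Context) error              { return nil }

// OnWorkerDispatched: result != nil means the dispatch failed and the worker
// will never come online.
func (e *exampleMaster) OnWorkerDispatched(worker WorkerHandle, result error) error { return nil }

func (e *exampleMaster) OnWorkerOnline(worker WorkerHandle) error                { return nil }
func (e *exampleMaster) OnWorkerOffline(worker WorkerHandle, reason error) error { return nil }

func (e *exampleMaster) OnWorkerMessage(worker WorkerHandle, topic p2p.Topic, message interface{}) error {
	return nil
}

func (e *exampleMaster) OnWorkerStatusUpdated(worker WorkerHandle, newStatus *frameModel.WorkerStatus) error {
	return nil
}

func (e *exampleMaster) CloseImpl(ctx context.Context) {}
func (e *exampleMaster) StopImpl(ctx context.Context)  {}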
const (
	createWorkerWaitQuotaTimeout = 5 * time.Second
	createWorkerTimeout          = 10 * time.Second
	maxCreateWorkerConcurrency   = 100
)

// CreateWorkerOpt specifies an option for creating a worker.
type CreateWorkerOpt = master.CreateWorkerOpt

// CreateWorkerWithResourceRequirements specifies the resource requirements of a worker.
func CreateWorkerWithResourceRequirements(resources ...resModel.ResourceID) CreateWorkerOpt {
	return master.CreateWorkerWithResourceRequirements(resources...)
}

// CreateWorkerWithSelectors specifies the selectors used to dispatch the worker.
func CreateWorkerWithSelectors(selectors ...*label.Selector) CreateWorkerOpt {
	return master.CreateWorkerWithSelectors(selectors...)
}

// BaseMaster defines the master interface; it embeds the Master interface and
// contains more core logic of a master.
type BaseMaster interface {
	Master

	// MetaKVClient returns the business metastore kv client with job-level isolation.
	MetaKVClient() metaModel.KVClient

	// MetricFactory returns a Prometheus factory with some underlying labels (e.g. job-id, worker-id).
	MetricFactory() promutil.Factory

	// Logger returns a zap logger with some underlying fields (e.g. job-id).
	Logger() *zap.Logger

	// MasterMeta returns the metadata of the master.
	MasterMeta() *frameModel.MasterMeta

	// GetWorkers returns the handles of all workers, from which we can get each
	// worker's status and ID, and the method for sending messages to a specific worker.
	GetWorkers() map[frameModel.WorkerID]WorkerHandle

	// IsMasterReady returns whether the master has received heartbeats from all
	// workers after a failover. If this is the first time the JobMaster started up,
	// the return value is always true.
	IsMasterReady() bool

	// Exit should be called when the master (in user logic) wants to exit.
	// exitReason: ExitReasonFinished/ExitReasonCanceled/ExitReasonFailed
	// NOTE: currently no implementation uses this method, but we keep it to make the interface intact.
	Exit(ctx context.Context, exitReason ExitReason, err error, detail []byte) error

	// CreateWorker creates a worker, with a flexible way of passing options.
	// If the worker needs to access certain file system resources, it must pass
	// the resource IDs via CreateWorkerOpt.
	CreateWorker(
		workerType frameModel.WorkerType,
		config WorkerConfig,
		opts ...CreateWorkerOpt,
	) (frameModel.WorkerID, error)
}
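// exampleCreateWorker sketches how business logic (e.g. inside Tick) dispatches
// a worker through BaseMaster. The function and the resource ID are hypothetical
// placeholders; real resource IDs depend on the deployment.
func exampleCreateWorker(
	bm BaseMaster, workerType frameModel.WorkerType, cfg WorkerConfig,
) (frameModel.WorkerID, error) {
	return bm.CreateWorker(
		workerType,
		cfg,
		// grant the worker access to a file-system resource
		CreateWorkerWithResourceRequirements("/local/example-resource"),
	)
}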
// DefaultBaseMaster implements the BaseMaster interface.
type DefaultBaseMaster struct {
	Impl MasterImpl

	// dependencies
	messageHandlerManager p2p.MessageHandlerManager
	messageSender         p2p.MessageSender
	// framework metastore client
	frameMetaClient    pkgOrm.Client
	executorGroup      client.ExecutorGroup
	serverMasterClient client.ServerMasterClient

	clock clock.Clock

	// workerManager maintains the list of all workers and their statuses.
	workerManager *master.WorkerManager

	currentEpoch atomic.Int64

	wg        sync.WaitGroup
	errCenter *errctx.ErrCenter

	// closeCh is closed when the BaseMaster is exiting
	closeCh chan struct{}

	id            frameModel.MasterID // id of this master itself
	advertiseAddr string
	nodeID        p2p.NodeID
	timeoutConfig config.TimeoutConfig
	masterMeta    *frameModel.MasterMeta

	workerCreator *master.WorkerCreator

	// workerProjectMap keeps the <WorkerID, ProjectInfo> mapping.
	// It's only used by the JobManager, which has workers (jobmasters) with different project info.
	// NOTICE: when the JobManager fails over, we need to load all workers' (jobmasters') project info.
	workerProjectMap sync.Map
	// masterProjectInfo is the projectInfo of the master itself
	masterProjectInfo tenant.ProjectInfo

	// business kvclient with namespace
	businessMetaKVClient metaModel.KVClient

	// metricFactory can produce metrics with underlying project info and job info
	metricFactory promutil.Factory

	// logger is the zap logger with underlying project info and job info
	logger *zap.Logger

	// components for easier unit testing
	uuidGen uuid.Generator

	// TODO: use a shared quota for all masters.
	createWorkerQuota quota.ConcurrencyQuota

	// deps is a container for injected dependencies
	deps *deps.Deps
}

// NotifyExit implements BaseWorker.NotifyExit
func (m *DefaultBaseMaster) NotifyExit(ctx context.Context, errIn error) error {
	// no-op for now.
	return nil
}

type masterParams struct {
	dig.In

	MessageHandlerManager p2p.MessageHandlerManager
	MessageSender         p2p.MessageSender
	// framework metastore client
	FrameMetaClient    pkgOrm.Client
	BusinessClientConn metaModel.ClientConn
	ExecutorGroup      client.ExecutorGroup
	ServerMasterClient client.ServerMasterClient
}
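// masterParams is filled from the dig-based dependency container carried in
// dcontext.Context (see ctx.Deps().Fill(&params) in NewBaseMaster below). A
// standalone sketch of the same mechanism with go.uber.org/dig directly, where
// the constructors are hypothetical:
//
//	c := dig.New()
//	_ = c.Provide(newMessageHandlerManager) // func() p2p.MessageHandlerManager
//	_ = c.Provide(newMessageSender)         // func() p2p.MessageSender
//	// ... one constructor per field ...
//	_ = c.Invoke(func(p masterParams) {
//		// every exported field of p has been resolved from the container
//	})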
// NewBaseMaster creates a new DefaultBaseMaster instance.
func NewBaseMaster(
	ctx *dcontext.Context,
	impl MasterImpl,
	id frameModel.MasterID,
	tp frameModel.WorkerType,
) BaseMaster {
	var (
		nodeID        p2p.NodeID
		advertiseAddr string
		masterMeta    = &frameModel.MasterMeta{}
		params        masterParams
	)
	if ctx != nil {
		nodeID = ctx.Environ.NodeID
		advertiseAddr = ctx.Environ.Addr
		metaBytes := ctx.Environ.MasterMetaBytes
		err := errors.Trace(masterMeta.Unmarshal(metaBytes))
		if err != nil {
			log.Warn("invalid master meta", zap.ByteString("data", metaBytes), zap.Error(err))
		}
	}

	if err := ctx.Deps().Fill(&params); err != nil {
		// TODO: more elegant error handling
		log.Panic("failed to provide dependencies", zap.Error(err))
	}

	logger := logutil.FromContext(*ctx)

	cli, err := meta.NewKVClientWithNamespace(params.BusinessClientConn, ctx.ProjectInfo.UniqueID(), id)
	if err != nil {
		// TODO: more elegant error handling
		log.Panic("failed to create business kvclient", zap.Error(err))
	}

	return &DefaultBaseMaster{
		Impl:                  impl,
		messageHandlerManager: params.MessageHandlerManager,
		messageSender:         params.MessageSender,
		frameMetaClient:       params.FrameMetaClient,
		executorGroup:         params.ExecutorGroup,
		serverMasterClient:    params.ServerMasterClient,
		id:                    id,
		clock:                 clock.New(),

		timeoutConfig: config.DefaultTimeoutConfig(),
		masterMeta:    masterMeta,

		closeCh: make(chan struct{}),

		errCenter: errctx.NewErrCenter(),

		uuidGen: uuid.NewGenerator(),

		nodeID:            nodeID,
		advertiseAddr:     advertiseAddr,
		masterProjectInfo: ctx.ProjectInfo,

		createWorkerQuota:    quota.NewConcurrencyQuota(maxCreateWorkerConcurrency),
		businessMetaKVClient: cli,
		metricFactory:        promutil.NewFactory4Master(ctx.ProjectInfo, MustConvertWorkerType2JobType(tp), id),
		logger:               frameLog.WithMasterID(logger, id),

		deps: ctx.Deps(),
	}
}

// MetaKVClient returns the business-space metaclient.
func (m *DefaultBaseMaster) MetaKVClient() metaModel.KVClient {
	return m.businessMetaKVClient
}

// MetricFactory implements BaseMaster.MetricFactory
func (m *DefaultBaseMaster) MetricFactory() promutil.Factory {
	return m.metricFactory
}

// Logger implements BaseMaster.Logger
func (m *DefaultBaseMaster) Logger() *zap.Logger {
	return m.logger
}

// Init implements BaseMaster.Init
func (m *DefaultBaseMaster) Init(ctx context.Context) error {
	// Note this context must not be held in any resident goroutine.
	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	isInit, err := m.doInit(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	if isInit {
		if err := m.Impl.InitImpl(ctx); err != nil {
			m.errCenter.OnError(err)
			return errors.Trace(err)
		}
	} else {
		if err := m.Impl.OnMasterRecovered(ctx); err != nil {
			m.errCenter.OnError(err)
			return errors.Trace(err)
		}
	}

	if err := m.markStateInMetadata(ctx, frameModel.MasterStateInit); err != nil {
		return errors.Trace(err)
	}
	return nil
}
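// Init and Poll both route errors through errCenter. A sketch of the errctx
// pattern in isolation, using only the methods seen in this file (the exact
// semantics live in the errctx package; this is an assumption-level summary):
//
//	center := errctx.NewErrCenter()
//	ctx, cancel := center.WithCancelOnFirstError(context.Background())
//	defer cancel()
//	center.OnError(errors.New("boom")) // records the first error and cancels ctx
//	err := center.CheckError()         // later reads return that first error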
func (m *DefaultBaseMaster) doInit(ctx context.Context) (isFirstStartUp bool, err error) {
	isInit, epoch, err := m.refreshMetadata(ctx)
	if err != nil {
		return false, errors.Trace(err)
	}
	m.currentEpoch.Store(epoch)

	m.workerManager = master.NewWorkerManager(
		m.id,
		epoch,
		m.frameMetaClient,
		m.messageSender,
		func(_ context.Context, handle master.WorkerHandle) error {
			return m.Impl.OnWorkerOnline(handle)
		},
		func(_ context.Context, handle master.WorkerHandle, err error) error {
			return m.Impl.OnWorkerOffline(handle, err)
		},
		func(_ context.Context, handle master.WorkerHandle) error {
			return m.Impl.OnWorkerStatusUpdated(handle, handle.Status())
		},
		func(_ context.Context, handle master.WorkerHandle, err error) error {
			return m.Impl.OnWorkerDispatched(handle, err)
		}, isInit, m.timeoutConfig, m.clock).
		WithLogger(m.logger)

	inheritedSelectors := m.masterMeta.Ext.Selectors
	workerCreator := master.NewWorkerCreatorBuilder().
		WithMasterID(m.id).
		WithHooks(&master.WorkerCreationHooks{BeforeStartingWorker: m.workerManager.BeforeStartingWorker}).
		WithExecutorGroup(m.executorGroup).
		WithServerMasterClient(m.serverMasterClient).
		WithFrameMetaClient(m.frameMetaClient).
		WithLogger(m.Logger()).
		WithInheritedSelectors(inheritedSelectors...).
		Build()
	m.workerCreator = workerCreator

	if err := m.registerMessageHandlers(ctx); err != nil {
		return false, errors.Trace(err)
	}

	if !isInit {
		if err := m.workerManager.InitAfterRecover(ctx); err != nil {
			return false, err
		}
	}
	return isInit, nil
}

func (m *DefaultBaseMaster) registerMessageHandlers(ctx context.Context) error {
	ok, err := m.messageHandlerManager.RegisterHandler(
		ctx,
		frameModel.HeartbeatPingTopic(m.id),
		&frameModel.HeartbeatPingMessage{},
		func(sender p2p.NodeID, value p2p.MessageValue) error {
			msg := value.(*frameModel.HeartbeatPingMessage)
			m.Logger().Info("Heartbeat Ping received",
				zap.Any("msg", msg),
				zap.String("master-id", m.id))

			replyMsg := &frameModel.HeartbeatPongMessage{
				SendTime:   msg.SendTime,
				ReplyTime:  m.clock.Now(),
				ToWorkerID: msg.FromWorkerID,
				Epoch:      m.currentEpoch.Load(),
				IsFinished: msg.IsFinished,
			}
			ok, err := m.messageSender.SendToNode(
				ctx,
				sender,
				frameModel.HeartbeatPongTopic(m.id, msg.FromWorkerID),
				replyMsg)
			if err != nil {
				return err
			}
			if !ok {
				log.Warn("Sending Heartbeat Pong failed",
					zap.Any("reply", replyMsg))
				return nil
			}
			m.workerManager.HandleHeartbeat(msg, sender)
			return nil
		})
	if err != nil {
		return err
	}
	if !ok {
		m.Logger().Panic("duplicate handler", zap.String("topic", frameModel.HeartbeatPingTopic(m.id)))
	}

	ok, err = m.messageHandlerManager.RegisterHandler(
		ctx,
		statusutil.WorkerStatusTopic(m.id),
		&statusutil.WorkerStatusMessage{},
		func(sender p2p.NodeID, value p2p.MessageValue) error {
			msg := value.(*statusutil.WorkerStatusMessage)
			m.workerManager.OnWorkerStatusUpdateMessage(msg)
			return nil
		})
	if err != nil {
		return err
	}
	if !ok {
		m.Logger().Panic("duplicate handler", zap.String("topic", statusutil.WorkerStatusTopic(m.id)))
	}

	return nil
}

// Poll implements BaseMaster.Poll
func (m *DefaultBaseMaster) Poll(ctx context.Context) error {
	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	if err := m.doPoll(ctx); err != nil {
		return errors.Trace(err)
	}

	if err := m.Impl.Tick(ctx); err != nil {
		m.errCenter.OnError(err)
		return errors.Trace(err)
	}

	return nil
}

func (m *DefaultBaseMaster) doPoll(ctx context.Context) error {
	if err := m.errCenter.CheckError(); err != nil {
		return err
	}

	select {
	case <-m.closeCh:
		return errors.ErrMasterClosed.GenWithStackByArgs()
	default:
	}

	if err := m.messageHandlerManager.CheckError(ctx); err != nil {
		return errors.Trace(err)
	}
	return m.workerManager.Tick(ctx)
}

// MasterMeta implements BaseMaster.MasterMeta
func (m *DefaultBaseMaster) MasterMeta() *frameModel.MasterMeta {
	return m.masterMeta
}

// MasterID implements BaseMaster.MasterID
func (m *DefaultBaseMaster) MasterID() frameModel.MasterID {
	return m.id
}

// GetWorkers implements BaseMaster.GetWorkers
func (m *DefaultBaseMaster) GetWorkers() map[frameModel.WorkerID]WorkerHandle {
	return m.workerManager.GetWorkers()
}
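// exampleInspectWorkers sketches a typical consumer of GetWorkers: iterate the
// handles and inspect each worker's last reported status. The function name and
// log fields are illustrative only.
func exampleInspectWorkers(bm BaseMaster) {
	for id, handle := range bm.GetWorkers() {
		bm.Logger().Info("worker status",
			zap.String("worker-id", id),
			zap.Any("status", handle.Status()))
	}
}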
func (m *DefaultBaseMaster) doClose() {
	closeCtx, cancel := context.WithTimeout(context.Background(), time.Second*3)
	defer cancel()

	close(m.closeCh)
	m.wg.Wait()
	if err := m.messageHandlerManager.Clean(closeCtx); err != nil {
		m.Logger().Warn("Failed to clean up message handlers",
			zap.String("master-id", m.id), zap.Error(err))
	}
	promutil.UnregisterWorkerMetrics(m.id)
	m.businessMetaKVClient.Close()
}

// Close implements BaseMaster.Close
func (m *DefaultBaseMaster) Close(ctx context.Context) error {
	m.Impl.CloseImpl(ctx)

	m.persistMetaError()
	m.doClose()
	return nil
}

// Stop implements Master.Stop
func (m *DefaultBaseMaster) Stop(ctx context.Context) error {
	m.Impl.StopImpl(ctx)
	return nil
}

// refreshMetadata loads and updates the metadata. The master meta is persisted
// before the master is created; in this function we update some fields to their
// current values, including the epoch, nodeID and advertiseAddr.
func (m *DefaultBaseMaster) refreshMetadata(ctx context.Context) (isInit bool, epoch frameModel.Epoch, err error) {
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)

	masterMeta, err := metaClient.Load(ctx)
	if err != nil {
		return false, 0, err
	}

	epoch, err = m.frameMetaClient.GenEpoch(ctx)
	if err != nil {
		return false, 0, err
	}

	// We should update the master data to reflect our current information.
	masterMeta.Epoch = epoch
	masterMeta.Addr = m.advertiseAddr
	masterMeta.NodeID = m.nodeID

	if err := metaClient.Update(ctx, masterMeta.RefreshValues()); err != nil {
		return false, 0, errors.Trace(err)
	}

	m.masterMeta = masterMeta
	// isInit being true means the master has been created but not yet initialized.
	isInit = masterMeta.State == frameModel.MasterStateUninit

	return
}

func (m *DefaultBaseMaster) markStateInMetadata(
	ctx context.Context, code frameModel.MasterState,
) error {
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
	m.masterMeta.State = code
	return metaClient.Update(ctx, m.masterMeta.UpdateStateValues())
}

func (m *DefaultBaseMaster) persistMetaError() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
	defer cancel()

	if err := m.errCenter.CheckError(); err != nil {
		metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
		m.masterMeta.ErrorMsg = err.Error()
		if err2 := metaClient.Update(ctx, m.masterMeta.UpdateErrorValues()); err2 != nil {
			m.Logger().Warn("Failed to update error message",
				zap.String("master-id", m.id), zap.Error(err2))
		}
	}
}
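// Together, the metadata helpers above maintain the master's persisted state
// machine. Sketched (the terminal states are set by Exit via
// exitWithoutSetErrCenter below):
//
//	Uninit --Init--> Init --Exit--> Finished | Stopped | Failed
//
// persistMetaError additionally records the first error seen by errCenter when
// the master closes.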
// PrepareWorkerConfig extracts information from the WorkerConfig into detail fields.
//   - If workerType is a master type, the config is a *MasterMeta struct and
//     contains the pre-allocated master ID and the JSON-marshaled config.
//   - If workerType is a worker type, the config is a user-defined config struct; we
//     marshal it to a byte slice as the returned config, and generate a random WorkerID.
func (m *DefaultBaseMaster) PrepareWorkerConfig(
	workerType frameModel.WorkerType, config WorkerConfig,
) (rawConfig []byte, workerID frameModel.WorkerID, err error) {
	switch workerType {
	case frameModel.CvsJobMaster, frameModel.FakeJobMaster, frameModel.DMJobMaster:
		masterMeta, ok := config.(*frameModel.MasterMeta)
		if !ok {
			err = errors.ErrMasterInvalidMeta.GenWithStackByArgs(config)
			return
		}
		rawConfig = masterMeta.Config
		workerID = masterMeta.ID
	case frameModel.WorkerDMDump, frameModel.WorkerDMLoad, frameModel.WorkerDMSync:
		var b bytes.Buffer
		err = toml.NewEncoder(&b).Encode(config)
		if err != nil {
			return
		}
		rawConfig = b.Bytes()
		workerID = m.uuidGen.NewString()
	default:
		rawConfig, err = json.Marshal(config)
		if err != nil {
			return
		}
		workerID = m.uuidGen.NewString()
	}
	return
}
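// A sketch of the default (JSON) branch of PrepareWorkerConfig; demoConfig and
// the worker type value are hypothetical:
//
//	type demoConfig struct {
//		TableName string `json:"table-name"`
//	}
//
//	raw, workerID, _ := m.PrepareWorkerConfig(
//		someNonDMWorkerType, // any type outside the master/DM cases above
//		&demoConfig{TableName: "t1"},
//	)
//	// raw == []byte(`{"table-name":"t1"}`); workerID is a fresh UUID string.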
// CreateWorker implements BaseMaster.CreateWorker
func (m *DefaultBaseMaster) CreateWorker(
	workerType frameModel.WorkerType,
	config WorkerConfig,
	opts ...CreateWorkerOpt,
) (frameModel.WorkerID, error) {
	m.Logger().Info("CreateWorker",
		zap.Stringer("worker-type", workerType),
		zap.Any("worker-config", config),
		zap.String("master-id", m.id))

	rawConfig, workerID, err := m.PrepareWorkerConfig(workerType, config)
	if err != nil {
		return "", err
	}

	errCtx, cancel := m.errCenter.WithCancelOnFirstError(context.Background())
	defer cancel()
	quotaCtx, cancel := context.WithTimeout(errCtx, createWorkerWaitQuotaTimeout)
	defer cancel()
	if err := m.createWorkerQuota.Consume(quotaCtx); err != nil {
		return "", errors.WrapError(errors.ErrMasterConcurrencyExceeded, err)
	}

	go func() {
		defer func() {
			m.createWorkerQuota.Release()
		}()

		errCtx, cancelErrCtx := m.errCenter.WithCancelOnFirstError(context.Background())
		defer cancelErrCtx()

		requestCtx, cancelRequestCtx := context.WithTimeout(errCtx, createWorkerTimeout)
		defer cancelRequestCtx()

		err := m.workerCreator.CreateWorker(
			requestCtx, m.GetProjectInfo(workerID), workerType, workerID, rawConfig,
			opts...)
		if err != nil {
			m.workerManager.AbortCreatingWorker(workerID, err)
		}
	}()

	return workerID, nil
}

// IsMasterReady implements BaseMaster.IsMasterReady
func (m *DefaultBaseMaster) IsMasterReady() bool {
	return m.workerManager.IsInitialized()
}

// Exit implements BaseMaster.Exit
// NOTE: currently no implementation uses this method, but we keep it to make the interface intact.
func (m *DefaultBaseMaster) Exit(ctx context.Context, exitReason ExitReason, err error, detail []byte) error {
	// Record the error in errCenter so that callers cannot forget to return
	// directly after calling Exit; keep the original error in errCenter if possible.
	defer func() {
		if err == nil {
			err = errors.ErrWorkerFinish.FastGenByArgs()
		}
		m.errCenter.OnError(err)
	}()

	return m.exitWithoutSetErrCenter(ctx, exitReason, err, detail)
}

func (m *DefaultBaseMaster) exitWithoutSetErrCenter(ctx context.Context, exitReason ExitReason, err error, detail []byte) (errRet error) {
	switch exitReason {
	case ExitReasonFinished:
		m.masterMeta.State = frameModel.MasterStateFinished
	case ExitReasonCanceled:
		// TODO: replace stop with cancel
		m.masterMeta.State = frameModel.MasterStateStopped
	case ExitReasonFailed:
		m.masterMeta.State = frameModel.MasterStateFailed
	default:
		m.masterMeta.State = frameModel.MasterStateFailed
	}

	if err != nil {
		m.masterMeta.ErrorMsg = err.Error()
	} else {
		m.masterMeta.ErrorMsg = ""
	}
	m.masterMeta.Detail = detail
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
	return metaClient.Update(ctx, m.masterMeta.ExitValues())
}

// SetProjectInfo sets the project info of a specific worker.
// NOTICE: only used by the JobManager to set the project for a job (a job is a
// worker from the JobManager's perspective).
func (m *DefaultBaseMaster) SetProjectInfo(workerID frameModel.WorkerID, projectInfo tenant.ProjectInfo) {
	m.workerProjectMap.Store(workerID, projectInfo)
}

// DeleteProjectInfo deletes the project info of a specific worker.
// NOTICE: only used by the JobManager when stopping a job.
func (m *DefaultBaseMaster) DeleteProjectInfo(workerID frameModel.WorkerID) {
	m.workerProjectMap.Delete(workerID)
}

// GetProjectInfo gets the project info of the worker.
// WARN: once DeleteProjectInfo is called, GetProjectInfo may return unexpected project info.
// For the JobManager: it sets the <jobID, projectInfo> pair in advance, so calling
// GetProjectInfo before DeleteProjectInfo returns the correct projectInfo.
// For a JobMaster: master and workers always share the same projectInfo and workerProjectMap is empty.
func (m *DefaultBaseMaster) GetProjectInfo(masterID frameModel.MasterID) tenant.ProjectInfo {
	projectInfo, exists := m.workerProjectMap.Load(masterID)
	if !exists {
		return m.masterProjectInfo
	}

	return projectInfo.(tenant.ProjectInfo)
}

// InitProjectInfosAfterRecover sets the project info for all workers after the master recovers.
// NOTICE: only used by the JobManager during failover.
func (m *DefaultBaseMaster) InitProjectInfosAfterRecover(jobs []*frameModel.MasterMeta) {
	for _, meta := range jobs {
		// TODO: fix the TenantID
		m.workerProjectMap.Store(meta.ID, tenant.NewProjectInfo("", meta.ProjectID))
	}
}
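// A sketch of the JobManager-side lifecycle of workerProjectMap, combining the
// helpers above (jobID and the tenant/project names are illustrative):
//
//	// on job submission
//	m.SetProjectInfo(jobID, tenant.NewProjectInfo("tenant-1", "project-1"))
//	// while the job is running, CreateWorker resolves the project via
//	info := m.GetProjectInfo(jobID)
//	// on job stop
//	m.DeleteProjectInfo(jobID)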