// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package framework

import (
	"bytes"
	"context"
	"encoding/json"
	"sync"
	"time"

	"github.com/BurntSushi/toml"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/engine/framework/config"
	"github.com/pingcap/tiflow/engine/framework/internal/master"
	frameLog "github.com/pingcap/tiflow/engine/framework/logutil"
	"github.com/pingcap/tiflow/engine/framework/metadata"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/statusutil"
	"github.com/pingcap/tiflow/engine/pkg/client"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	dcontext "github.com/pingcap/tiflow/engine/pkg/context"
	"github.com/pingcap/tiflow/engine/pkg/deps"
	"github.com/pingcap/tiflow/engine/pkg/errctx"
	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
	"github.com/pingcap/tiflow/engine/pkg/meta"
	metaModel "github.com/pingcap/tiflow/engine/pkg/meta/model"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/pingcap/tiflow/engine/pkg/quota"
	"github.com/pingcap/tiflow/engine/pkg/tenant"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/label"
	"github.com/pingcap/tiflow/pkg/logutil"
	"github.com/pingcap/tiflow/pkg/uuid"
	"go.uber.org/atomic"
	"go.uber.org/dig"
	"go.uber.org/zap"
)

// Master defines a basic interface that can run in the dataflow engine runtime.
type Master interface {
	Init(ctx context.Context) error
	Poll(ctx context.Context) error
	MasterID() frameModel.MasterID
	Close(ctx context.Context) error
	Stop(ctx context.Context) error
	NotifyExit(ctx context.Context, errIn error) error
}
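
// As an illustration only, a runtime that owns a Master could drive it with a
// loop like the following minimal sketch. The pollInterval constant and the
// error handling are hypothetical and not this package's actual scheduler:
//
//	func runMaster(ctx context.Context, m Master) error {
//		if err := m.Init(ctx); err != nil {
//			return m.Close(ctx)
//		}
//		for {
//			if err := m.Poll(ctx); err != nil {
//				// Give the business logic a chance to react before closing.
//				_ = m.NotifyExit(ctx, err)
//				return m.Close(ctx)
//			}
//			time.Sleep(pollInterval)
//		}
//	}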

// MasterImpl defines the interface to implement a master; business logic can be
// added in the functions of this interface.
type MasterImpl interface {
	// InitImpl is called the first time the MasterImpl instance is initialized,
	// after OnOpenAPIInitialized. When InitImpl returns without error, the framework
	// will try to persist an internal state so that a later failover calls
	// OnMasterRecovered rather than InitImpl.
	// Return:
	// - error to let the framework call CloseImpl; the framework may retry InitImpl
	//   several times afterwards. For a non-retryable failure, business logic should
	//   call Exit.
	// Concurrent safety:
	// - this function is not concurrent with other callbacks.
	InitImpl(ctx context.Context) error

	// OnMasterRecovered is called when the MasterImpl instance has been failed over
	// from an error by the framework. For this MasterImpl instance, it's called after
	// OnOpenAPIInitialized.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function is not concurrent with other callbacks.
	OnMasterRecovered(ctx context.Context) error

	// Tick is called on a fixed interval after MasterImpl's InitImpl or OnMasterRecovered;
	// business logic can do periodic tasks here.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be concurrently called with other callbacks except for
	//   Tick itself, OnOpenAPIInitialized, InitImpl, OnMasterRecovered, CloseImpl,
	//   StopImpl.
	Tick(ctx context.Context) error

	// OnWorkerDispatched is called when the asynchronous action of CreateWorker
	// has finished. Only after OnWorkerDispatched may OnWorkerOnline and
	// OnWorkerStatusUpdated of the same worker be called.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be concurrently called with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel.
	OnWorkerDispatched(worker WorkerHandle, result error) error

	// OnWorkerOnline is called when the first heartbeat for a worker is received.
	// NOTE: OnWorkerOffline can appear without OnWorkerOnline.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be concurrently called with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel, the same worker's OnWorkerStatusUpdated.
	OnWorkerOnline(worker WorkerHandle) error

	// OnWorkerOffline is called as the consequence of the worker's Exit or a
	// heartbeat timeout. It's the last callback function among OnWorkerXXX for a worker.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be concurrently called with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel.
	OnWorkerOffline(worker WorkerHandle, reason error) error

	// OnWorkerMessage is called when a customized message is received.
	OnWorkerMessage(worker WorkerHandle, topic p2p.Topic, message interface{}) error

	// OnWorkerStatusUpdated is called as the consequence of the worker's UpdateStatus.
	// Return:
	// - error to let the framework call CloseImpl.
	// Concurrent safety:
	// - this function may be concurrently called with another worker's OnWorkerXXX,
	//   Tick, CloseImpl, StopImpl, OnCancel, the same worker's OnWorkerOnline.
	OnWorkerStatusUpdated(worker WorkerHandle, newStatus *frameModel.WorkerStatus) error

	// CloseImpl is called as the consequence of returning an error from InitImpl,
	// OnMasterRecovered or Tick; Tick will be stopped after entering this function.
	// The framework may try to create a new MasterImpl instance afterwards.
	// Business logic is expected to release resources here, but business developers
	// should be aware that when the runtime crashes, CloseImpl may not get the
	// chance to be called.
	// TODO: no other callbacks will be called after and concurrent with CloseImpl
	// Concurrent safety:
	// - this function may be concurrently called with OnWorkerMessage, OnCancel,
	//   OnWorkerDispatched, OnWorkerOnline, OnWorkerOffline, OnWorkerStatusUpdated.
	CloseImpl(ctx context.Context)

	// StopImpl is called as the consequence of the business logic calling Exit. Tick
	// will be stopped after entering this function, and the framework will treat this
	// MasterImpl as non-recoverable.
	// There's at most one invocation of StopImpl after Exit. If the runtime crashes,
	// StopImpl may not get the chance to be called.
	// Concurrent safety:
	// - this function may be concurrently called with OnWorkerMessage, OnCancel,
	//   OnWorkerDispatched, OnWorkerOnline, OnWorkerOffline, OnWorkerStatusUpdated.
	StopImpl(ctx context.Context)
}
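
// A minimal sketch of a business MasterImpl, assuming the common pattern of
// embedding a BaseMaster for the methods MasterImpl does not cover. The
// exampleMaster type is hypothetical, and the remaining callbacks are omitted:
//
//	type exampleMaster struct {
//		BaseMaster
//	}
//
//	func (e *exampleMaster) InitImpl(ctx context.Context) error {
//		// First start-up: dispatch the initial set of workers here.
//		return nil
//	}
//
//	func (e *exampleMaster) OnMasterRecovered(ctx context.Context) error {
//		// Failover: rebuild in-memory state from GetWorkers() and the metastore.
//		return nil
//	}
//
//	func (e *exampleMaster) Tick(ctx context.Context) error {
//		// Periodic bookkeeping, e.g. checking worker progress.
//		return nil
//	}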

const (
	createWorkerWaitQuotaTimeout = 5 * time.Second
	createWorkerTimeout          = 10 * time.Second
	maxCreateWorkerConcurrency   = 100
)

// CreateWorkerOpt specifies an option for creating a worker.
type CreateWorkerOpt = master.CreateWorkerOpt

// CreateWorkerWithResourceRequirements specifies the resource requirements of a worker.
func CreateWorkerWithResourceRequirements(resources ...resModel.ResourceID) CreateWorkerOpt {
	return master.CreateWorkerWithResourceRequirements(resources...)
}

// CreateWorkerWithSelectors specifies the selectors used to dispatch the worker.
func CreateWorkerWithSelectors(selectors ...*label.Selector) CreateWorkerOpt {
	return master.CreateWorkerWithSelectors(selectors...)
}

// BaseMaster defines the master interface; it embeds the Master interface and
// contains more of the core logic of a master.
type BaseMaster interface {
	Master

	// MetaKVClient returns the business metastore kv client with job-level isolation.
	MetaKVClient() metaModel.KVClient

	// MetricFactory returns a Prometheus factory with some underlying labels (e.g. job-id, worker-id).
	MetricFactory() promutil.Factory

	// Logger returns a zap logger with some underlying fields (e.g. job-id).
	Logger() *zap.Logger

	// MasterMeta returns the metadata of the master.
	MasterMeta() *frameModel.MasterMeta

	// GetWorkers returns the handles of all workers, from which we can get each
	// worker's status and ID, as well as the method for sending messages to a specific worker.
	GetWorkers() map[frameModel.WorkerID]WorkerHandle

	// IsMasterReady returns whether the master has received heartbeats from all
	// workers after a failover. If this is the first time the JobMaster started up,
	// the return value is always true.
	IsMasterReady() bool

	// Exit should be called when the master (in user logic) wants to exit.
	// exitReason: ExitReasonFinished/ExitReasonCanceled/ExitReasonFailed
	// NOTE: Currently no implementation uses this method, but we keep it to make the interface intact.
	Exit(ctx context.Context, exitReason ExitReason, err error, detail []byte) error

	// CreateWorker is the latest version of CreateWorker, with a more flexible
	// way of passing options.
	// If the worker needs to access certain file system resources, it must pass
	// resource IDs via CreateWorkerOpt.
	CreateWorker(
		workerType frameModel.WorkerType,
		config WorkerConfig,
		opts ...CreateWorkerOpt,
	) (frameModel.WorkerID, error)
}
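
// For illustration, creating a worker from business logic might look like the
// following sketch; the worker type, config type, selector and resource ID
// below are hypothetical:
//
//	workerID, err := m.CreateWorker(
//		myWorkerType, // a frameModel.WorkerType registered by the business
//		&myWorkerConfig{},
//		CreateWorkerWithResourceRequirements("/local/example-resource"),
//		CreateWorkerWithSelectors(mySelector),
//	)
//	if err != nil {
//		return err
//	}
//	// The dispatch result arrives asynchronously via OnWorkerDispatched.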

// DefaultBaseMaster implements the BaseMaster interface.
type DefaultBaseMaster struct {
	Impl MasterImpl

	// dependencies
	messageHandlerManager p2p.MessageHandlerManager
	messageSender         p2p.MessageSender
	// framework metastore client
	frameMetaClient    pkgOrm.Client
	executorGroup      client.ExecutorGroup
	serverMasterClient client.ServerMasterClient

	clock clock.Clock

	// workerManager maintains the list of all workers and
	// their statuses.
	workerManager *master.WorkerManager

	currentEpoch atomic.Int64

	wg        sync.WaitGroup
	errCenter *errctx.ErrCenter

	// closeCh is closed when the BaseMaster is exiting
	closeCh chan struct{}

	id            frameModel.MasterID // id of this master itself
	advertiseAddr string
	nodeID        p2p.NodeID
	timeoutConfig config.TimeoutConfig
	masterMeta    *frameModel.MasterMeta

	workerCreator *master.WorkerCreator

	// workerProjectMap keeps the <WorkerID, ProjectInfo> map.
	// It's only used by the JobManager, whose workers (jobmasters) have different project info.
	// [NOTICE]: When the JobManager fails over, we need to load all workers' (jobmasters') project info.
	workerProjectMap sync.Map
	// masterProjectInfo is the projectInfo of the master itself
	masterProjectInfo tenant.ProjectInfo

	// business kvclient with namespace
	businessMetaKVClient metaModel.KVClient

	// metricFactory can produce metrics with underlying project info and job info
	metricFactory promutil.Factory

	// logger is the zap logger with underlying project info and job info
	logger *zap.Logger

	// components for easier unit testing
	uuidGen uuid.Generator

	// TODO use a shared quota for all masters.
	createWorkerQuota quota.ConcurrencyQuota

	// deps is a container for injected dependencies
	deps *deps.Deps
}

// NotifyExit implements Master.NotifyExit.
func (m *DefaultBaseMaster) NotifyExit(ctx context.Context, errIn error) error {
	// no-op for now.
	return nil
}

type masterParams struct {
	dig.In

	MessageHandlerManager p2p.MessageHandlerManager
	MessageSender         p2p.MessageSender
	// framework metastore client
	FrameMetaClient    pkgOrm.Client
	BusinessClientConn metaModel.ClientConn
	ExecutorGroup      client.ExecutorGroup
	ServerMasterClient client.ServerMasterClient
}
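
// masterParams embeds dig.In, so each exported field is filled from the
// dependency container rather than constructed by hand. As a rough standalone
// sketch of that mechanism using go.uber.org/dig directly (the newMessageSender
// constructor is hypothetical):
//
//	c := dig.New()
//	_ = c.Provide(newMessageSender) // func(...) p2p.MessageSender
//	_ = c.Invoke(func(p masterParams) {
//		// p.MessageSender has been populated by the container.
//	})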

// NewBaseMaster creates a new DefaultBaseMaster instance
func NewBaseMaster(
	ctx *dcontext.Context,
	impl MasterImpl,
	id frameModel.MasterID,
	tp frameModel.WorkerType,
) BaseMaster {
	var (
		nodeID        p2p.NodeID
		advertiseAddr string
		masterMeta    = &frameModel.MasterMeta{}
		params        masterParams
	)
	if ctx != nil {
		nodeID = ctx.Environ.NodeID
		advertiseAddr = ctx.Environ.Addr
		metaBytes := ctx.Environ.MasterMetaBytes
		err := errors.Trace(masterMeta.Unmarshal(metaBytes))
		if err != nil {
			log.Warn("invalid master meta", zap.ByteString("data", metaBytes), zap.Error(err))
		}
	}

	if err := ctx.Deps().Fill(&params); err != nil {
		// TODO more elegant error handling
		log.Panic("failed to provide dependencies", zap.Error(err))
	}

	logger := logutil.FromContext(*ctx)

	cli, err := meta.NewKVClientWithNamespace(params.BusinessClientConn, ctx.ProjectInfo.UniqueID(), id)
	if err != nil {
		// TODO more elegant error handling
		log.Panic("failed to create business kvclient", zap.Error(err))
	}

	return &DefaultBaseMaster{
		Impl:                  impl,
		messageHandlerManager: params.MessageHandlerManager,
		messageSender:         params.MessageSender,
		frameMetaClient:       params.FrameMetaClient,
		executorGroup:         params.ExecutorGroup,
		serverMasterClient:    params.ServerMasterClient,
		id:                    id,
		clock:                 clock.New(),

		timeoutConfig: config.DefaultTimeoutConfig(),
		masterMeta:    masterMeta,

		closeCh: make(chan struct{}),

		errCenter: errctx.NewErrCenter(),

		uuidGen: uuid.NewGenerator(),

		nodeID:            nodeID,
		advertiseAddr:     advertiseAddr,
		masterProjectInfo: ctx.ProjectInfo,

		createWorkerQuota:    quota.NewConcurrencyQuota(maxCreateWorkerConcurrency),
		businessMetaKVClient: cli,
		metricFactory:        promutil.NewFactory4Master(ctx.ProjectInfo, MustConvertWorkerType2JobType(tp), id),
		logger:               frameLog.WithMasterID(logger, id),

		deps: ctx.Deps(),
	}
}

// MetaKVClient returns the business space metaclient
func (m *DefaultBaseMaster) MetaKVClient() metaModel.KVClient {
	return m.businessMetaKVClient
}

// MetricFactory implements BaseMaster.MetricFactory
func (m *DefaultBaseMaster) MetricFactory() promutil.Factory {
	return m.metricFactory
}

// Logger implements BaseMaster.Logger
func (m *DefaultBaseMaster) Logger() *zap.Logger {
	return m.logger
}

// Init implements BaseMaster.Init
func (m *DefaultBaseMaster) Init(ctx context.Context) error {
	// Note this context must not be held in any resident goroutine.
	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	isInit, err := m.doInit(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	if isInit {
		if err := m.Impl.InitImpl(ctx); err != nil {
			m.errCenter.OnError(err)
			return errors.Trace(err)
		}
	} else {
		if err := m.Impl.OnMasterRecovered(ctx); err != nil {
			m.errCenter.OnError(err)
			return errors.Trace(err)
		}
	}

	if err := m.markStateInMetadata(ctx, frameModel.MasterStateInit); err != nil {
		return errors.Trace(err)
	}
	return nil
}

func (m *DefaultBaseMaster) doInit(ctx context.Context) (isFirstStartUp bool, err error) {
	isInit, epoch, err := m.refreshMetadata(ctx)
	if err != nil {
		return false, errors.Trace(err)
	}
	m.currentEpoch.Store(epoch)

	m.workerManager = master.NewWorkerManager(
		m.id,
		epoch,
		m.frameMetaClient,
		m.messageSender,
		func(_ context.Context, handle master.WorkerHandle) error {
			return m.Impl.OnWorkerOnline(handle)
		},
		func(_ context.Context, handle master.WorkerHandle, err error) error {
			return m.Impl.OnWorkerOffline(handle, err)
		},
		func(_ context.Context, handle master.WorkerHandle) error {
			return m.Impl.OnWorkerStatusUpdated(handle, handle.Status())
		},
		func(_ context.Context, handle master.WorkerHandle, err error) error {
			return m.Impl.OnWorkerDispatched(handle, err)
		}, isInit, m.timeoutConfig, m.clock).
		WithLogger(m.logger)

	inheritedSelectors := m.masterMeta.Ext.Selectors
	workerCreator := master.NewWorkerCreatorBuilder().
		WithMasterID(m.id).
		WithHooks(&master.WorkerCreationHooks{BeforeStartingWorker: m.workerManager.BeforeStartingWorker}).
		WithExecutorGroup(m.executorGroup).
		WithServerMasterClient(m.serverMasterClient).
		WithFrameMetaClient(m.frameMetaClient).
		WithLogger(m.Logger()).
		WithInheritedSelectors(inheritedSelectors...).
		Build()
	m.workerCreator = workerCreator

	if err := m.registerMessageHandlers(ctx); err != nil {
		return false, errors.Trace(err)
	}

	if !isInit {
		if err := m.workerManager.InitAfterRecover(ctx); err != nil {
			return false, err
		}
	}
	return isInit, nil
}

func (m *DefaultBaseMaster) registerMessageHandlers(ctx context.Context) error {
	ok, err := m.messageHandlerManager.RegisterHandler(
		ctx,
		frameModel.HeartbeatPingTopic(m.id),
		&frameModel.HeartbeatPingMessage{},
		func(sender p2p.NodeID, value p2p.MessageValue) error {
			msg := value.(*frameModel.HeartbeatPingMessage)
			m.Logger().Info("Heartbeat Ping received",
				zap.Any("msg", msg),
				zap.String("master-id", m.id))

			replyMsg := &frameModel.HeartbeatPongMessage{
				SendTime:   msg.SendTime,
				ReplyTime:  m.clock.Now(),
				ToWorkerID: msg.FromWorkerID,
				Epoch:      m.currentEpoch.Load(),
				IsFinished: msg.IsFinished,
			}
			ok, err := m.messageSender.SendToNode(
				ctx,
				sender,
				frameModel.HeartbeatPongTopic(m.id, msg.FromWorkerID),
				replyMsg)
			if err != nil {
				return err
			}
			if !ok {
				log.Warn("Sending Heartbeat Pong failed",
					zap.Any("reply", replyMsg))
				return nil
			}
			m.workerManager.HandleHeartbeat(msg, sender)
			return nil
		})
	if err != nil {
		return err
	}
	if !ok {
		m.Logger().Panic("duplicate handler", zap.String("topic", frameModel.HeartbeatPingTopic(m.id)))
	}

	ok, err = m.messageHandlerManager.RegisterHandler(
		ctx,
		statusutil.WorkerStatusTopic(m.id),
		&statusutil.WorkerStatusMessage{},
		func(sender p2p.NodeID, value p2p.MessageValue) error {
			msg := value.(*statusutil.WorkerStatusMessage)
			m.workerManager.OnWorkerStatusUpdateMessage(msg)
			return nil
		})
	if err != nil {
		return err
	}
	if !ok {
		m.Logger().Panic("duplicate handler", zap.String("topic", statusutil.WorkerStatusTopic(m.id)))
	}

	return nil
}
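
// Schematically, the heartbeat exchange registered above looks like this:
//
//	worker --- HeartbeatPingMessage ---> HeartbeatPingTopic(masterID)
//	master --- HeartbeatPongMessage ---> HeartbeatPongTopic(masterID, workerID)
//
// The pong echoes the ping's SendTime and carries the master's current epoch,
// which a worker can use to tell apart different incarnations of the master.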

// Poll implements BaseMaster.Poll
func (m *DefaultBaseMaster) Poll(ctx context.Context) error {
	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	if err := m.doPoll(ctx); err != nil {
		return errors.Trace(err)
	}

	if err := m.Impl.Tick(ctx); err != nil {
		m.errCenter.OnError(err)
		return errors.Trace(err)
	}

	return nil
}

func (m *DefaultBaseMaster) doPoll(ctx context.Context) error {
	if err := m.errCenter.CheckError(); err != nil {
		return err
	}

	select {
	case <-m.closeCh:
		return errors.ErrMasterClosed.GenWithStackByArgs()
	default:
	}

	if err := m.messageHandlerManager.CheckError(ctx); err != nil {
		return errors.Trace(err)
	}
	return m.workerManager.Tick(ctx)
}

// MasterMeta implements BaseMaster.MasterMeta
func (m *DefaultBaseMaster) MasterMeta() *frameModel.MasterMeta {
	return m.masterMeta
}

// MasterID implements BaseMaster.MasterID
func (m *DefaultBaseMaster) MasterID() frameModel.MasterID {
	return m.id
}

// GetWorkers implements BaseMaster.GetWorkers
func (m *DefaultBaseMaster) GetWorkers() map[frameModel.WorkerID]WorkerHandle {
	return m.workerManager.GetWorkers()
}

func (m *DefaultBaseMaster) doClose() {
	closeCtx, cancel := context.WithTimeout(context.Background(), time.Second*3)
	defer cancel()

	close(m.closeCh)
	m.wg.Wait()
	if err := m.messageHandlerManager.Clean(closeCtx); err != nil {
		m.Logger().Warn("Failed to clean up message handlers",
			zap.String("master-id", m.id), zap.Error(err))
	}
	promutil.UnregisterWorkerMetrics(m.id)
	m.businessMetaKVClient.Close()
}

// Close implements BaseMaster.Close
func (m *DefaultBaseMaster) Close(ctx context.Context) error {
	m.Impl.CloseImpl(ctx)

	m.persistMetaError()
	m.doClose()
	return nil
}

// Stop implements Master.Stop
func (m *DefaultBaseMaster) Stop(ctx context.Context) error {
	m.Impl.StopImpl(ctx)
	return nil
}

// refreshMetadata loads and updates the metadata with the current epoch, nodeID,
// advertiseAddr, etc. The master meta is persisted before the master is created;
// in this function we update some fields to their current values, including
// epoch, nodeID and advertiseAddr.
func (m *DefaultBaseMaster) refreshMetadata(ctx context.Context) (isInit bool, epoch frameModel.Epoch, err error) {
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)

	masterMeta, err := metaClient.Load(ctx)
	if err != nil {
		return false, 0, err
	}

	epoch, err = m.frameMetaClient.GenEpoch(ctx)
	if err != nil {
		return false, 0, err
	}

	// We should update the master meta to reflect our current information.
	masterMeta.Epoch = epoch
	masterMeta.Addr = m.advertiseAddr
	masterMeta.NodeID = m.nodeID

	if err := metaClient.Update(ctx, masterMeta.RefreshValues()); err != nil {
		return false, 0, errors.Trace(err)
	}

	m.masterMeta = masterMeta
	// isInit being true means the master is created but has not been initialized.
	isInit = masterMeta.State == frameModel.MasterStateUninit

	return
}

func (m *DefaultBaseMaster) markStateInMetadata(
	ctx context.Context, code frameModel.MasterState,
) error {
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
	m.masterMeta.State = code
	return metaClient.Update(ctx, m.masterMeta.UpdateStateValues())
}

func (m *DefaultBaseMaster) persistMetaError() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
	defer cancel()

	if err := m.errCenter.CheckError(); err != nil {
		metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
		m.masterMeta.ErrorMsg = err.Error()
		if err2 := metaClient.Update(ctx, m.masterMeta.UpdateErrorValues()); err2 != nil {
			m.Logger().Warn("Failed to update error message",
				zap.String("master-id", m.id), zap.Error(err2))
		}
	}
}

// PrepareWorkerConfig extracts information from WorkerConfig into detail fields.
//   - If workerType is a master type, the config is a `*MasterMeta` struct and
//     contains a pre-allocated master ID and a JSON-marshaled config.
//   - If workerType is a worker type, the config is a user-defined config struct; we
//     marshal it to a byte slice as the returned config, and generate a random WorkerID.
func (m *DefaultBaseMaster) PrepareWorkerConfig(
	workerType frameModel.WorkerType, config WorkerConfig,
) (rawConfig []byte, workerID frameModel.WorkerID, err error) {
	switch workerType {
	case frameModel.CvsJobMaster, frameModel.FakeJobMaster, frameModel.DMJobMaster:
		masterMeta, ok := config.(*frameModel.MasterMeta)
		if !ok {
			err = errors.ErrMasterInvalidMeta.GenWithStackByArgs(config)
			return
		}
		rawConfig = masterMeta.Config
		workerID = masterMeta.ID
	case frameModel.WorkerDMDump, frameModel.WorkerDMLoad, frameModel.WorkerDMSync:
		var b bytes.Buffer
		err = toml.NewEncoder(&b).Encode(config)
		if err != nil {
			return
		}
		rawConfig = b.Bytes()
		workerID = m.uuidGen.NewString()
	default:
		rawConfig, err = json.Marshal(config)
		if err != nil {
			return
		}
		workerID = m.uuidGen.NewString()
	}
	return
}
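
// For example, for a generic worker type (neither a jobmaster type nor a DM
// worker type), the default branch above JSON-encodes whatever config struct
// the business passes in and generates a fresh worker ID. The myWorkerType and
// myConfig names below are hypothetical:
//
//	rawConfig, workerID, err := m.PrepareWorkerConfig(
//		myWorkerType, &myConfig{Param: "value"},
//	)
//	// rawConfig is `{"Param":"value"}` (field names subject to json tags),
//	// and workerID is a newly generated UUID string.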

// CreateWorker implements BaseMaster.CreateWorker
func (m *DefaultBaseMaster) CreateWorker(
	workerType frameModel.WorkerType,
	config WorkerConfig,
	opts ...CreateWorkerOpt,
) (frameModel.WorkerID, error) {
	m.Logger().Info("CreateWorker",
		zap.Stringer("worker-type", workerType),
		zap.Any("worker-config", config),
		zap.String("master-id", m.id))

	rawConfig, workerID, err := m.PrepareWorkerConfig(workerType, config)
	if err != nil {
		return "", err
	}

	errCtx, cancel := m.errCenter.WithCancelOnFirstError(context.Background())
	defer cancel()
	quotaCtx, cancel := context.WithTimeout(errCtx, createWorkerWaitQuotaTimeout)
	defer cancel()
	if err := m.createWorkerQuota.Consume(quotaCtx); err != nil {
		return "", errors.WrapError(errors.ErrMasterConcurrencyExceeded, err)
	}

	go func() {
		defer func() {
			m.createWorkerQuota.Release()
		}()

		errCtx, cancelErrCtx := m.errCenter.WithCancelOnFirstError(context.Background())
		defer cancelErrCtx()

		requestCtx, cancelRequestCtx := context.WithTimeout(errCtx, createWorkerTimeout)
		defer cancelRequestCtx()

		err := m.workerCreator.CreateWorker(
			requestCtx, m.GetProjectInfo(workerID), workerType, workerID, rawConfig,
			opts...)
		if err != nil {
			m.workerManager.AbortCreatingWorker(workerID, err)
		}
	}()

	return workerID, nil
}

// IsMasterReady implements BaseMaster.IsMasterReady
func (m *DefaultBaseMaster) IsMasterReady() bool {
	return m.workerManager.IsInitialized()
}

// Exit implements BaseMaster.Exit
// NOTE: Currently no implementation uses this method, but we keep it to make the interface intact.
func (m *DefaultBaseMaster) Exit(ctx context.Context, exitReason ExitReason, err error, detail []byte) error {
	// Record the error in the errCenter so that a user who forgets to return
	// right after calling 'Exit' still triggers the exit path; keep the original
	// error in the errCenter if possible.
	defer func() {
		if err == nil {
			err = errors.ErrWorkerFinish.FastGenByArgs()
		}
		m.errCenter.OnError(err)
	}()

	return m.exitWithoutSetErrCenter(ctx, exitReason, err, detail)
}
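
// As an illustrative sketch, business logic that has finished all its work
// might call Exit from a callback and return immediately afterwards. The
// allWorkersFinished flag and detailJSON payload are hypothetical,
// business-defined values:
//
//	if allWorkersFinished {
//		return m.Exit(ctx, ExitReasonFinished, nil, detailJSON)
//	}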

func (m *DefaultBaseMaster) exitWithoutSetErrCenter(ctx context.Context, exitReason ExitReason, err error, detail []byte) (errRet error) {
	switch exitReason {
	case ExitReasonFinished:
		m.masterMeta.State = frameModel.MasterStateFinished
	case ExitReasonCanceled:
		// TODO: replace stop with cancel
		m.masterMeta.State = frameModel.MasterStateStopped
	case ExitReasonFailed:
		m.masterMeta.State = frameModel.MasterStateFailed
	default:
		m.masterMeta.State = frameModel.MasterStateFailed
	}

	if err != nil {
		m.masterMeta.ErrorMsg = err.Error()
	} else {
		m.masterMeta.ErrorMsg = ""
	}
	m.masterMeta.Detail = detail
	metaClient := metadata.NewMasterMetadataClient(m.id, m.frameMetaClient)
	return metaClient.Update(ctx, m.masterMeta.ExitValues())
}

// SetProjectInfo sets the project info of a specific worker.
// [NOTICE]: Only used by the JobManager to set the project for different jobs
// (a job is a worker of the JobManager).
func (m *DefaultBaseMaster) SetProjectInfo(workerID frameModel.WorkerID, projectInfo tenant.ProjectInfo) {
	m.workerProjectMap.Store(workerID, projectInfo)
}

// DeleteProjectInfo deletes the project info of a specific worker.
// NOTICE: Only used by the JobManager when stopping a job.
func (m *DefaultBaseMaster) DeleteProjectInfo(workerID frameModel.WorkerID) {
	m.workerProjectMap.Delete(workerID)
}

// GetProjectInfo gets the project info of the worker.
// [WARN]: Once 'DeleteProjectInfo' is called, 'GetProjectInfo' may return unexpected project info.
// For the JobManager: it sets the <jobID, projectInfo> pair in advance,
// so if we call 'GetProjectInfo' before 'DeleteProjectInfo', we can expect a correct projectInfo.
// For a JobMaster: master and worker always have the same projectInfo and workerProjectMap is empty.
func (m *DefaultBaseMaster) GetProjectInfo(masterID frameModel.MasterID) tenant.ProjectInfo {
	projectInfo, exists := m.workerProjectMap.Load(masterID)
	if !exists {
		return m.masterProjectInfo
	}

	return projectInfo.(tenant.ProjectInfo)
}

// InitProjectInfosAfterRecover sets the project info for all workers after the master recovers.
// NOTICE: Only used by the JobManager during failover.
func (m *DefaultBaseMaster) InitProjectInfosAfterRecover(jobs []*frameModel.MasterMeta) {
	for _, meta := range jobs {
		// TODO: fix the TenantID
		m.workerProjectMap.Store(meta.ID, tenant.NewProjectInfo("", meta.ProjectID))
	}
}