github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/processor/manager.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package processor

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/vars"
	"github.com/pingcap/tiflow/pkg/config"
	cerror "github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/etcd"
	"github.com/pingcap/tiflow/pkg/orchestrator"
	"github.com/pingcap/tiflow/pkg/upstream"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

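// commandTp identifies the kind of command sent to the processor manager
// through its command queue.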
type commandTp int

const (
	commandTpUnknown commandTp = iota
	commandTpWriteDebugInfo
	processorLogsWarnDuration = 1 * time.Second
)

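// command is a control message delivered to the manager through commandQueue;
// `done` is closed when the command has been handled (or when sending it fails).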
type command struct {
	tp      commandTp
	payload interface{}
	done    chan<- error
}

// Manager is a manager of processors, which maintains the state and behavior of processors.
type Manager interface {
	orchestrator.Reactor

	// Close closes the manager itself and all processors. It must not be called
	// concurrently with `Tick`. After it is called, no other method should be called.
	Close()

	WriteDebugInfo(ctx context.Context, w io.Writer, done chan<- error)
}

// managerImpl is a manager of processors, which maintains the state and behavior of processors.
type managerImpl struct {
	captureInfo     *model.CaptureInfo
	liveness        *model.Liveness
	processors      map[model.ChangeFeedID]*processor
	commandQueue    chan *command
	upstreamManager *upstream.Manager

	newProcessor func(
		*model.ChangeFeedInfo,
		*model.ChangeFeedStatus,
		*model.CaptureInfo,
		model.ChangeFeedID,
		*upstream.Upstream,
		*model.Liveness,
		uint64,
		*config.SchedulerConfig,
		etcd.OwnerCaptureInfoClient,
		*vars.GlobalVars,
	) *processor
	cfg        *config.SchedulerConfig
	globalVars *vars.GlobalVars

	metricProcessorCloseDuration prometheus.Observer
}

// NewManager creates a new processor manager
func NewManager(
	captureInfo *model.CaptureInfo,
	upstreamManager *upstream.Manager,
	liveness *model.Liveness,
	cfg *config.SchedulerConfig,
	globalVars *vars.GlobalVars,
) Manager {
	return &managerImpl{
		captureInfo:                  captureInfo,
		liveness:                     liveness,
		processors:                   make(map[model.ChangeFeedID]*processor),
		commandQueue:                 make(chan *command, 4),
		upstreamManager:              upstreamManager,
		newProcessor:                 NewProcessor,
		metricProcessorCloseDuration: processorCloseDuration,
		cfg:                          cfg,
		globalVars:                   globalVars,
	}
}
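
// A minimal usage sketch (illustrative only; variable names such as `mgr`,
// `schedulerCfg` and `liveness` are not taken from this file): the capture
// creates the manager and hands it to an etcd worker, which then drives Tick
// with snapshots of the etcd state.
//
//	mgr := NewManager(captureInfo, upstreamManager, &liveness, schedulerCfg, globalVars)
//	// ... an orchestrator/etcd worker repeatedly calls mgr.Tick(ctx, state) ...
//	defer mgr.Close()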

// Tick implements the `orchestrator.Reactor` interface.
// The `state` parameter is sent by the etcd worker and must be a snapshot of the KVs in etcd.
// Tick creates or removes processor instances according to `state`, and passes `state` on to the processor instances.
func (m *managerImpl) Tick(stdCtx context.Context, state orchestrator.ReactorState) (nextState orchestrator.ReactorState, err error) {
	globalState := state.(*orchestrator.GlobalReactorState)
	m.handleCommand()

	var inactiveChangefeedCount int
	for changefeedID, changefeedState := range globalState.Changefeeds {
		if !changefeedState.Active(m.captureInfo.ID) {
			inactiveChangefeedCount++
			m.closeProcessor(changefeedID)
			continue
		}
		currentChangefeedEpoch := changefeedState.Info.Epoch
		p, exist := m.processors[changefeedID]
		if !exist {
			up, ok := m.upstreamManager.Get(changefeedState.Info.UpstreamID)
			if !ok {
				upstreamInfo := globalState.Upstreams[changefeedState.Info.UpstreamID]
				up = m.upstreamManager.AddUpstream(upstreamInfo)
			}
			failpoint.Inject("processorManagerHandleNewChangefeedDelay", nil)

			cfg := *m.cfg
			cfg.ChangefeedSettings = changefeedState.Info.Config.Scheduler
			p = m.newProcessor(
				changefeedState.Info, changefeedState.Status,
				m.captureInfo, changefeedID, up, m.liveness,
				currentChangefeedEpoch, &cfg, m.globalVars.EtcdClient,
				m.globalVars)
			m.processors[changefeedID] = p
		}
		if currentChangefeedEpoch != p.changefeedEpoch {
			// The changefeed has restarted due to an error; the processor is stale.
			m.closeProcessor(changefeedID)
			continue
		}
		// check that the changefeed is normal before ticking
		if !checkChangefeedNormal(changefeedState) {
			patchProcessorErr(p.captureInfo, changefeedState,
				cerror.ErrAdminStopProcessor.GenWithStackByArgs())
			m.closeProcessor(changefeedID)
			continue
		}
		// check that the capture is alive
		changefeedState.CheckCaptureAlive(p.captureInfo.ID)
		// check if the task position is created
		if createTaskPosition(changefeedState, p.captureInfo) {
			continue
		}
		err, warning := p.Tick(stdCtx, changefeedState.Info, changefeedState.Status)
		if warning != nil {
			patchProcessorWarning(p.captureInfo, changefeedState, warning)
		}
		if err != nil {
			patchProcessorErr(p.captureInfo, changefeedState, err)
			// patchProcessorErr has already patched the error to tell the owner, so the
			// manager can just close the processor and continue to tick other processors
			m.closeProcessor(changefeedID)
		}
	}
	// check whether any processors in memory are leaked
	if len(globalState.Changefeeds)-inactiveChangefeedCount != len(m.processors) {
		for changefeedID := range m.processors {
			if _, exist := globalState.Changefeeds[changefeedID]; !exist {
				m.closeProcessor(changefeedID)
			}
		}
	}

	if err := m.upstreamManager.Tick(stdCtx, globalState); err != nil {
		return state, errors.Trace(err)
	}
	return state, nil
}

// checkChangefeedNormal checks if the changefeed is runnable.
func checkChangefeedNormal(changefeed *orchestrator.ChangefeedReactorState) bool {
	// check the state in this tick and make sure that the admin job type of the changefeed is not a stop state
	if changefeed.Info.AdminJobType.IsStopState() || changefeed.Status.AdminJobType.IsStopState() {
		return false
	}
	// add a patch so that the etcd worker re-checks that the changefeed is runnable when it applies the patches.
	changefeed.CheckChangefeedNormal()
	return true
}

// createTaskPosition creates a new task position if one does not exist yet.
// The task position is missing only in the first tick after the processor starts running.
func createTaskPosition(changefeed *orchestrator.ChangefeedReactorState,
	captureInfo *model.CaptureInfo,
) (skipThisTick bool) {
	if _, exist := changefeed.TaskPositions[captureInfo.ID]; exist {
		return false
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				return &model.TaskPosition{}, true, nil
			}
			return position, false, nil
		})
	return true
}

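// patchProcessorErr records `err` in this capture's task position so that the
// owner can observe it; errors that are ignorable for a processor are only logged.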
func patchProcessorErr(captureInfo *model.CaptureInfo,
	changefeed *orchestrator.ChangefeedReactorState,
	err error,
) {
	if isProcessorIgnorableError(err) {
		log.Info("processor exited",
			zap.String("capture", captureInfo.ID),
			zap.String("namespace", changefeed.ID.Namespace),
			zap.String("changefeed", changefeed.ID.ID),
			zap.Error(err))
		return
	}
	// record error information in etcd
	var code string
	if rfcCode, ok := cerror.RFCCode(err); ok {
		code = string(rfcCode)
	} else {
		code = string(cerror.ErrProcessorUnknown.RFCCode())
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				position = &model.TaskPosition{}
			}
			position.Error = &model.RunningError{
				Time:    time.Now(),
				Addr:    captureInfo.AdvertiseAddr,
				Code:    code,
				Message: err.Error(),
			}
			return position, true, nil
		})
	log.Error("run processor failed",
		zap.String("capture", captureInfo.ID),
		zap.String("namespace", changefeed.ID.Namespace),
		zap.String("changefeed", changefeed.ID.ID),
		zap.Error(err))
}

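// patchProcessorWarning records a non-fatal warning in this capture's task
// position; a nil error is ignored.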
func patchProcessorWarning(captureInfo *model.CaptureInfo,
	changefeed *orchestrator.ChangefeedReactorState, err error,
) {
	if err == nil {
		return
	}
	var code string
	if rfcCode, ok := cerror.RFCCode(err); ok {
		code = string(rfcCode)
	} else {
		code = string(cerror.ErrProcessorUnknown.RFCCode())
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				position = &model.TaskPosition{}
			}
			position.Warning = &model.RunningError{
				Time:    time.Now(),
				Addr:    captureInfo.AdvertiseAddr,
				Code:    code,
				Message: err.Error(),
			}
			return position, true, nil
		})
}

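// closeProcessor closes the processor of the given changefeed (if any), records
// how long the close took, and removes it from the in-memory processor map.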
func (m *managerImpl) closeProcessor(changefeedID model.ChangeFeedID) {
	processor, exist := m.processors[changefeedID]
	if exist {
		startTime := time.Now()
		err := processor.Close()
		costTime := time.Since(startTime)
		if costTime > processorLogsWarnDuration {
			log.Warn("processor close took too long",
				zap.String("namespace", changefeedID.Namespace),
				zap.String("changefeed", changefeedID.ID),
				zap.String("capture", m.captureInfo.ID),
				zap.Duration("duration", costTime))
		}
		m.metricProcessorCloseDuration.Observe(costTime.Seconds())
		if err != nil {
			log.Warn("failed to close processor",
				zap.String("namespace", changefeedID.Namespace),
				zap.String("changefeed", changefeedID.ID),
				zap.Error(err))
		}
		delete(m.processors, changefeedID)
	}
}

// Close closes the manager itself and all processors.
// Note: this method must not be called concurrently with `Tick`.
func (m *managerImpl) Close() {
	log.Info("processor.Manager is closing")
	for changefeedID := range m.processors {
		m.closeProcessor(changefeedID)
	}
	// FIXME: we should drain the command queue and signal an error to callers.
}

// WriteDebugInfo writes the debug info of all processors to the given Writer.
func (m *managerImpl) WriteDebugInfo(
	ctx context.Context, w io.Writer, done chan<- error,
) {
	err := m.sendCommand(ctx, commandTpWriteDebugInfo, w, done)
	if err != nil {
		log.Warn("send command commandTpWriteDebugInfo failed", zap.Error(err))
	}
}
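
// Caller-side sketch (illustrative only; `mgr` and `buf` are not names from this
// file). The caller supplies a `done` channel and waits on it: it receives an
// error if writing failed, and the channel is simply closed on success or when
// sending the command fails.
//
//	done := make(chan error, 1)
//	var buf bytes.Buffer
//	mgr.WriteDebugInfo(ctx, &buf, done)
//	if err := <-done; err != nil {
//		// handle the error
//	}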

// sendCommand sends a command to the manager.
// `done` is closed upon command completion or when sendCommand returns an error.
func (m *managerImpl) sendCommand(
	ctx context.Context, tp commandTp, payload interface{}, done chan<- error,
) error {
	cmd := &command{tp: tp, payload: payload, done: done}
	select {
	case <-ctx.Done():
		close(done)
		return errors.Trace(ctx.Err())
	case m.commandQueue <- cmd:
		// FIXME: signal EtcdWorker to handle commands ASAP.
	}
	return nil
}

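// handleCommand drains at most one pending command from the command queue and
// executes it; it never blocks, so commands are only handled when Tick runs.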
func (m *managerImpl) handleCommand() {
	var cmd *command
	select {
	case cmd = <-m.commandQueue:
	default:
		return
	}
	defer close(cmd.done)
	switch cmd.tp {
	case commandTpWriteDebugInfo:
		w := cmd.payload.(io.Writer)
		err := m.writeDebugInfo(w)
		if err != nil {
			cmd.done <- err
		}
	default:
		log.Warn("Unknown command in processor manager", zap.Any("command", cmd))
	}
}

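// writeDebugInfo dumps the debug info of every processor owned by this manager
// to w, one changefeed after another.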
func (m *managerImpl) writeDebugInfo(w io.Writer) error {
	for changefeedID, processor := range m.processors {
		fmt.Fprintf(w, "changefeedID: %s\n", changefeedID)
		err := processor.WriteDebugInfo(w)
		if err != nil {
			return errors.Trace(err)
		}
		fmt.Fprintf(w, "\n")
	}

	return nil
}