github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/unit/unit.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package unit
    15  
    16  import (
    17  	"context"
    18  	"strings"
    19  	"time"
    20  
    21  	"github.com/pingcap/errors"
    22  	"github.com/pingcap/tiflow/dm/config"
    23  	"github.com/pingcap/tiflow/dm/pb"
    24  	"github.com/pingcap/tiflow/dm/pkg/binlog"
    25  	"github.com/pingcap/tiflow/dm/pkg/retry"
    26  	"github.com/pingcap/tiflow/dm/pkg/terror"
    27  )
    28  
    29  const (
    30  	// DefaultInitTimeout represents the default timeout value when initializing a process unit.
    31  	DefaultInitTimeout = time.Minute
    32  )
    33  
// Unit defines the interface for subtask process units, like syncer, loader, relay, etc.
// A Unit is not responsible for maintaining its own status such as "pausing"/"paused". The caller should
// maintain the status, for example, know the Unit is "paused" and avoid calling Pause again.
// All methods of the Unit interface can expect no concurrent invocation; the caller should guarantee this.
type Unit interface {
	// Init initializes the dm process unit.
	// Every unit does its base initialization in `Init`, and this must pass before the subtask starts running.
	// Other setups can be done at the beginning of `Process`, but this should be treated carefully to keep it
	// compatible with Pause / Resume.
	// If initialization succeeds, the outer caller should call `Close` when the unit (or the task) is finished,
	// stopped or canceled (because another unit's Init failed).
	// If initialization fails, Init itself should release the resources it acquired before (rolling itself back).
	Init(ctx context.Context) error
	// Process does the main logic, and on returning it must send a result to the pr channel.
	// When ctx is done, it stops the process and returns; otherwise the DM-worker will be blocked forever.
	// When not processing, call Process to continue or resume the process.
	Process(ctx context.Context, pr chan pb.ProcessResult)
	// Close shuts down the process and closes the unit; after that Process can not be called to resume.
	// The implementation should not block for a long time.
	Close()
	// Kill shuts down the process and closes the unit without a graceful shutdown.
	Kill()
	// Pause does some cleanups, and the unit can be resumed later. The caller will make sure Process has returned.
	// The implementation should not block for a long time.
	Pause()
	// Resume resumes the paused process, and on returning it must send a result to the pr channel.
	Resume(ctx context.Context, pr chan pb.ProcessResult)
	// Update updates the configuration.
	Update(ctx context.Context, cfg *config.SubTaskConfig) error

	// Status returns the unit's current status. The result may need calculation with the source status, like the
	// estimated time to catch up. If sourceStatus is nil, that calculation should be skipped.
	Status(sourceStatus *binlog.SourceStatus) interface{}
	// Type returns the unit's type.
	Type() pb.UnitType
	// IsFreshTask returns whether this is a fresh task (not processed before).
	// It is used to decide where the task should become restoring.
	// NOTE(review): "where" is presumably a typo for "whether" — confirm against the callers.
	IsFreshTask(ctx context.Context) (bool, error)
}
    72  
    73  // NewProcessError creates a new ProcessError
    74  // we can refine to add error scope field if needed.
    75  func NewProcessError(err error) *pb.ProcessError {
    76  	if e, ok := err.(*terror.Error); ok {
    77  		return &pb.ProcessError{
    78  			ErrCode:    int32(e.Code()),
    79  			ErrClass:   e.Class().String(),
    80  			ErrScope:   e.Scope().String(),
    81  			ErrLevel:   e.Level().String(),
    82  			Message:    terror.Message(e),
    83  			RawCause:   terror.Message(e.Cause()),
    84  			Workaround: e.Workaround(),
    85  		}
    86  	}
    87  
    88  	return &pb.ProcessError{
    89  		ErrCode:    int32(terror.ErrNotSet.Code()),
    90  		ErrClass:   terror.ErrNotSet.Class().String(),
    91  		ErrScope:   terror.ErrNotSet.Scope().String(),
    92  		ErrLevel:   terror.ErrNotSet.Level().String(),
    93  		Message:    terror.Message(err),
    94  		RawCause:   terror.Message(terror.ErrNotSet.Cause()),
    95  		Workaround: terror.ErrNotSet.Workaround(),
    96  	}
    97  }
    98  
    99  // IsCtxCanceledProcessErr returns true if the err's context canceled.
   100  func IsCtxCanceledProcessErr(err *pb.ProcessError) bool {
   101  	return strings.Contains(err.Message, "context canceled")
   102  }
   103  
   104  // JoinProcessErrors return the string of pb.ProcessErrors joined by ", ".
   105  func JoinProcessErrors(errors []*pb.ProcessError) string {
   106  	serrs := make([]string, 0, len(errors))
   107  	for _, serr := range errors {
   108  		serrs = append(serrs, serr.String())
   109  	}
   110  	return strings.Join(serrs, ", ")
   111  }
   112  
   113  // IsResumableError checks the error message and returns whether we need to
   114  // resume the task unit and retry.
   115  func IsResumableError(err *pb.ProcessError) bool {
   116  	if err == nil {
   117  		return true
   118  	}
   119  
   120  	// not elegant code, because TiDB doesn't expose some error
   121  	for _, msg := range retry.UnsupportedDDLMsgs {
   122  		if strings.Contains(strings.ToLower(err.RawCause), strings.ToLower(msg)) {
   123  			return false
   124  		}
   125  	}
   126  	for _, msg := range retry.UnsupportedDMLMsgs {
   127  		if strings.Contains(strings.ToLower(err.RawCause), strings.ToLower(msg)) {
   128  			return false
   129  		}
   130  	}
   131  	for _, msg := range retry.ReplicationErrMsgs {
   132  		if strings.Contains(strings.ToLower(err.RawCause), strings.ToLower(msg)) {
   133  			return false
   134  		}
   135  	}
   136  	if err.ErrCode == int32(terror.ErrParserParseRelayLog.Code()) {
   137  		for _, msg := range retry.ParseRelayLogErrMsgs {
   138  			if strings.Contains(strings.ToLower(err.Message), strings.ToLower(msg)) {
   139  				return false
   140  			}
   141  		}
   142  	}
   143  	if _, ok := retry.UnresumableErrCodes[err.ErrCode]; ok {
   144  		return false
   145  	}
   146  
   147  	return true
   148  }
   149  
   150  // IsResumableDBError checks whether the error is resumable DB error.
   151  // this is a simplified version of IsResumableError.
   152  // we use a blacklist to filter out some errors which can not be resumed,
   153  // all other errors is resumable.
   154  func IsResumableDBError(err error) bool {
   155  	if err == nil {
   156  		return true
   157  	}
   158  
   159  	err = errors.Cause(err)
   160  	if err == context.Canceled {
   161  		return false
   162  	}
   163  
   164  	// not elegant code, because TiDB doesn't expose some error
   165  	errStr := strings.ToLower(err.Error())
   166  	for _, msg := range retry.UnsupportedDDLMsgs {
   167  		if strings.Contains(errStr, strings.ToLower(msg)) {
   168  			return false
   169  		}
   170  	}
   171  	for _, msg := range retry.UnsupportedDMLMsgs {
   172  		if strings.Contains(errStr, strings.ToLower(msg)) {
   173  			return false
   174  		}
   175  	}
   176  	return true
   177  }
   178  
   179  // IsResumableRelayError return whether we need resume relay on error
   180  // since relay impl unit interface too, so we put it here.
   181  func IsResumableRelayError(err *pb.ProcessError) bool {
   182  	if _, ok := retry.UnresumableRelayErrCodes[err.ErrCode]; ok {
   183  		return false
   184  	}
   185  	return true
   186  }