github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/safe_mode.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package syncer
    15  
    16  import (
    17  	"time"
    18  
    19  	"github.com/pingcap/failpoint"
    20  	"github.com/pingcap/tiflow/dm/pkg/binlog"
    21  	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
    22  	"github.com/pingcap/tiflow/dm/pkg/terror"
    23  	"github.com/pingcap/tiflow/dm/unit"
    24  	"go.uber.org/zap"
    25  )
    26  
    27  func (s *Syncer) enableSafeModeByTaskCliArgs(tctx *tcontext.Context) {
    28  	//nolint:errcheck
    29  	s.safeMode.Add(tctx, 1)
    30  	s.tctx.L().Info("enable safe-mode because of task cli args")
    31  }
    32  
    33  func (s *Syncer) enableSafeModeInitializationPhase(tctx *tcontext.Context) {
    34  	var err error
    35  	defer func() {
    36  		if err != nil {
    37  			// send error to the fatal chan to interrupt the process
    38  			s.runFatalChan <- unit.NewProcessError(err)
    39  		}
    40  	}()
    41  
    42  	s.safeMode.Reset(tctx) // in initialization phase, reset first
    43  
    44  	// cliArgs has higher priority than config
    45  	if s.cliArgs != nil && s.cliArgs.SafeModeDuration != "" {
    46  		s.enableSafeModeByTaskCliArgs(tctx)
    47  		return
    48  	}
    49  
    50  	if s.cfg.SafeMode {
    51  		//nolint:errcheck
    52  		s.safeMode.Add(tctx, 1) // add 1 but has no corresponding -1, so keeps enabled
    53  		s.tctx.L().Info("enable safe-mode by config")
    54  	}
    55  
    56  	var duration time.Duration
    57  	initPhaseSeconds := s.cfg.SafeModeDuration
    58  
    59  	failpoint.Inject("SafeModeInitPhaseSeconds", func(val failpoint.Value) {
    60  		initPhaseSeconds = val.(string)
    61  		s.tctx.L().Info("set initPhaseSeconds", zap.String("failpoint", "SafeModeInitPhaseSeconds"), zap.String("value", initPhaseSeconds))
    62  	})
    63  	if initPhaseSeconds == "" {
    64  		duration = time.Second * time.Duration(2*s.cfg.CheckpointFlushInterval)
    65  	} else {
    66  		duration, err = time.ParseDuration(initPhaseSeconds)
    67  		if err != nil {
    68  			s.tctx.L().Error("enable safe-mode failed due to duration parse failed", zap.String("duration", initPhaseSeconds))
    69  			return
    70  		}
    71  	}
    72  	exitPoint := s.checkpoint.SafeModeExitPoint()
    73  	if exitPoint != nil {
    74  		beginLocation := s.checkpoint.GlobalPoint()
    75  		s.tctx.L().Info("compare exitPoint and beginLocation", zap.Stringer("exitPoint", exitPoint), zap.Stringer("beginLocation", beginLocation))
    76  		if binlog.CompareLocation(*exitPoint, beginLocation, s.cfg.EnableGTID) == 0 {
    77  			s.tctx.L().Info("exitPoint equal to beginLocation, so disable the safe mode")
    78  			s.checkpoint.SaveSafeModeExitPoint(nil)
    79  			// must flush here to avoid the following situation:
    80  			// 1. quit safe mode
    81  			// 2. push forward and replicate some sqls after safeModeExitPoint to downstream
    82  			// 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream
    83  			// 4. restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream
    84  			err = s.checkpoint.FlushSafeModeExitPoint(s.runCtx)
    85  			return
    86  		}
    87  		if duration == 0 {
    88  			err = terror.ErrSyncerReprocessWithSafeModeFail.Generate()
    89  			s.tctx.L().Error("safe-mode-duration=0 is conflict with that exitPoint not equal to beginLocation", zap.Error(err))
    90  			return
    91  		}
    92  		//nolint:errcheck
    93  		s.safeMode.Add(tctx, 1) // enable and will revert after pass SafeModeExitLoc
    94  		s.tctx.L().Info("enable safe-mode for safe mode exit point, will exit at", zap.Stringer("location", *exitPoint))
    95  	} else {
    96  		s.tctx.L().Info("enable safe-mode because of task initialization", zap.Duration("duration", duration))
    97  
    98  		if duration > 0 {
    99  			//nolint:errcheck
   100  			s.safeMode.Add(tctx, 1) // enable and will revert after 2 * CheckpointFlushInterval
   101  			go func() {
   102  				defer func() {
   103  					err2 := s.safeMode.Add(tctx, -1)
   104  					if err2 != nil {
   105  						s.runFatalChan <- unit.NewProcessError(err2)
   106  					}
   107  					if !s.safeMode.Enable() {
   108  						s.tctx.L().Info("disable safe-mode after task initialization finished")
   109  					}
   110  				}()
   111  
   112  				select {
   113  				case <-tctx.Context().Done():
   114  				case <-time.After(duration):
   115  				}
   116  			}()
   117  		}
   118  	}
   119  }