github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/backup/push.go

// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package backup

import (
	"context"
	"fmt"
	"sync"

	"github.com/opentracing/opentracing-go"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	backuppb "github.com/pingcap/kvproto/pkg/backup"
	"github.com/pingcap/kvproto/pkg/metapb"
	"go.uber.org/zap"

	berrors "github.com/pingcap/br/pkg/errors"
	"github.com/pingcap/br/pkg/logutil"
	"github.com/pingcap/br/pkg/redact"
	"github.com/pingcap/br/pkg/rtree"
	"github.com/pingcap/br/pkg/utils"
)

// pushDown wraps a backup task that is pushed down to every TiKV store.
type pushDown struct {
	mgr    ClientMgr
	respCh chan responseAndStore // backup responses streamed back from the stores
	errCh  chan error            // fatal per-store errors from SendBackup
}

// responseAndStore pairs a backup response with the store that produced it.
type responseAndStore struct {
	Resp  *backuppb.BackupResponse
	Store *metapb.Store
}

// GetResponse returns the backup response received from the store.
func (r responseAndStore) GetResponse() *backuppb.BackupResponse {
	return r.Resp
}

// GetStore returns the store that produced the response.
func (r responseAndStore) GetStore() *metapb.Store {
	return r.Store
}

// newPushDown creates a pushDown whose response and error channels are
// buffered to the given capacity.
func newPushDown(mgr ClientMgr, cap int) *pushDown {
	return &pushDown{
		mgr:    mgr,
		respCh: make(chan responseAndStore, cap),
		errCh:  make(chan error, cap),
	}
}
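
// A minimal usage sketch (hypothetical caller; the real wiring lives in the
// backup client):
//
//	push := newPushDown(mgr, len(stores))
//	tree, err := push.pushBackup(ctx, req, stores, func(ProgressUnit) {})
//	if err != nil {
//		// handle the failed push-down phase
//	}
//	_ = tree // ranges that were successfully backed up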

// pushBackup pushes the backup request down to all TiKV stores in the
// cluster and collects their responses into a range tree. Stores that are
// down or unreachable are skipped; their regions are retried by the
// fine-grained backup stage.
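//
// A sketch of the progress callback (hypothetical; in the real client it
// advances the CLI progress bar):
//
//	cb := func(unit ProgressUnit) {
//		if unit == RegionUnit {
//			// one more region has been backed up
//		}
//	}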
func (push *pushDown) pushBackup(
	ctx context.Context,
	req backuppb.BackupRequest,
	stores []*metapb.Store,
	progressCallBack func(ProgressUnit),
) (rtree.RangeTree, error) {
	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
		span1 := span.Tracer().StartSpan("pushDown.pushBackup", opentracing.ChildOf(span.Context()))
		defer span1.Finish()
		ctx = opentracing.ContextWithSpan(ctx, span1)
	}

	// Push down backup tasks to all tikv instances.
	res := rtree.NewRangeTree()
	failpoint.Inject("noop-backup", func(_ failpoint.Value) {
		logutil.CL(ctx).Warn("skipping normal backup, jump to fine-grained backup, meow :3", logutil.Key("start-key", req.StartKey), logutil.Key("end-key", req.EndKey))
		failpoint.Return(res, nil)
	})
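	// failpoint.Inject above is a no-op in regular builds; it only fires in
	// binaries rewritten by failpoint-ctl, where it can be armed through the
	// GO_FAILPOINTS environment variable, e.g.
	// GO_FAILPOINTS="github.com/pingcap/br/pkg/backup/noop-backup=return(true)".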

	wg := new(sync.WaitGroup)
	for _, s := range stores {
		store := s
		storeID := s.GetId()
		lctx := logutil.ContextWithField(ctx, zap.Uint64("store-id", storeID))
		if s.GetState() != metapb.StoreState_Up {
			logutil.CL(lctx).Warn("skip store", zap.Stringer("State", s.GetState()))
			continue
		}
		client, err := push.mgr.GetBackupClient(lctx, storeID)
		if err != nil {
			// BR should be able to back up even when some stores are disconnected.
			// The regions managed by this store can be retried at fine-grained backup then.
			logutil.CL(lctx).Warn("fail to connect store, skipping", zap.Error(err))
			continue
		}
		wg.Add(1)
		go func() {
			defer wg.Done()
			err := SendBackup(
				lctx, storeID, client, req,
				func(resp *backuppb.BackupResponse) error {
					// Forward all responses (including errors) to the collector loop.
					push.respCh <- responseAndStore{
						Resp:  resp,
						Store: store,
					}
					return nil
				},
				func() (backuppb.BackupClient, error) {
					logutil.CL(lctx).Warn("reset the connection in push")
					return push.mgr.ResetBackupClient(lctx, storeID)
				})
			// Forward the error; the collector below ignores disconnected stores.
			if err != nil {
				push.errCh <- err
				return
			}
		}()
	}

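	// Close respCh only after every sender goroutine has finished, so the
	// collector loop below can treat channel closure as the completion signal.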
	go func() {
		wg.Wait()
		// TODO: test concurrent receive response and close channel.
		close(push.respCh)
	}()

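	// Collect responses until respCh is closed, classifying per-range errors
	// and deciding whether the whole push-down phase has to abort.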
	for {
		select {
		case respAndStore, ok := <-push.respCh:
			if !ok {
				// Finished: all senders are done and respCh is closed.
				return res, nil
			}
			resp := respAndStore.GetResponse()
			store := respAndStore.GetStore()
			failpoint.Inject("backup-storage-error", func(val failpoint.Value) {
				msg := val.(string)
				logutil.CL(ctx).Debug("failpoint backup-storage-error injected.", zap.String("msg", msg))
				resp.Error = &backuppb.Error{
					Msg: msg,
				}
			})
			failpoint.Inject("tikv-rw-error", func(val failpoint.Value) {
				msg := val.(string)
				logutil.CL(ctx).Debug("failpoint tikv-rw-error injected.", zap.String("msg", msg))
				resp.Error = &backuppb.Error{
					Msg: msg,
				}
			})
			if resp.GetError() == nil {
				// A nil error means the range has been backed up successfully.
				res.Put(
					resp.GetStartKey(), resp.GetEndKey(), resp.GetFiles())

				// Update progress.
				progressCallBack(RegionUnit)
			} else {
				errPb := resp.GetError()
				switch v := errPb.Detail.(type) {
				case *backuppb.Error_KvError:
					logutil.CL(ctx).Warn("backup occur kv error", zap.Reflect("error", v))

				case *backuppb.Error_RegionError:
					logutil.CL(ctx).Warn("backup occur region error", zap.Reflect("error", v))

				case *backuppb.Error_ClusterIdError:
					logutil.CL(ctx).Error("backup occur cluster ID error", zap.Reflect("error", v))
					return res, errors.Annotatef(berrors.ErrKVClusterIDMismatch, "%v", errPb)
				default:
					if utils.MessageIsRetryableStorageError(errPb.GetMsg()) {
						logutil.CL(ctx).Warn("backup occur storage error", zap.String("error", errPb.GetMsg()))
						continue
					}
					if utils.MessageIsNotFoundStorageError(errPb.GetMsg()) {
						errMsg := fmt.Sprintf("File or directory not found on TiKV node (store id: %v; address: %s)", store.GetId(), redact.String(store.GetAddress()))
						logutil.CL(ctx).Error("", zap.String("error", berrors.ErrKVStorage.Error()+": "+errMsg),
							zap.String("workaround", "please ensure br and tikv nodes share the same disk and the users running br and tikv have the same uid."))
					}

					if utils.MessageIsPermissionDeniedStorageError(errPb.GetMsg()) {
						errMsg := fmt.Sprintf("I/O permission denied on TiKV node (store id: %v; address: %s)", store.GetId(), redact.String(store.GetAddress()))
						logutil.CL(ctx).Error("", zap.String("error", berrors.ErrKVStorage.Error()+": "+errMsg),
							zap.String("workaround", "please ensure tikv has permission to read from & write to the storage."))
					}
					return res, berrors.ErrKVStorage
				}
			}
		case err := <-push.errCh:
			if !berrors.Is(err, berrors.ErrFailedToConnect) {
				return res, errors.Annotatef(err, "failed to backup range [%s, %s)", redact.Key(req.StartKey), redact.Key(req.EndKey))
			}
			logutil.CL(ctx).Warn("skipping disconnected stores", logutil.ShortError(err))
			return res, nil
		}
	}
}