github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/backup/push.go (about) 1 // Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0. 2 3 package backup 4 5 import ( 6 "context" 7 "fmt" 8 "sync" 9 10 "github.com/opentracing/opentracing-go" 11 "github.com/pingcap/errors" 12 "github.com/pingcap/failpoint" 13 backuppb "github.com/pingcap/kvproto/pkg/backup" 14 "github.com/pingcap/kvproto/pkg/metapb" 15 "go.uber.org/zap" 16 17 berrors "github.com/pingcap/br/pkg/errors" 18 "github.com/pingcap/br/pkg/logutil" 19 "github.com/pingcap/br/pkg/redact" 20 "github.com/pingcap/br/pkg/rtree" 21 "github.com/pingcap/br/pkg/utils" 22 ) 23 24 // pushDown wraps a backup task. 25 type pushDown struct { 26 mgr ClientMgr 27 respCh chan responseAndStore 28 errCh chan error 29 } 30 31 type responseAndStore struct { 32 Resp *backuppb.BackupResponse 33 Store *metapb.Store 34 } 35 36 func (r responseAndStore) GetResponse() *backuppb.BackupResponse { 37 return r.Resp 38 } 39 40 func (r responseAndStore) GetStore() *metapb.Store { 41 return r.Store 42 } 43 44 // newPushDown creates a push down backup. 45 func newPushDown(mgr ClientMgr, cap int) *pushDown { 46 return &pushDown{ 47 mgr: mgr, 48 respCh: make(chan responseAndStore, cap), 49 errCh: make(chan error, cap), 50 } 51 } 52 53 // FullBackup make a full backup of a tikv cluster. 54 func (push *pushDown) pushBackup( 55 ctx context.Context, 56 req backuppb.BackupRequest, 57 stores []*metapb.Store, 58 progressCallBack func(ProgressUnit), 59 ) (rtree.RangeTree, error) { 60 if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil { 61 span1 := span.Tracer().StartSpan("pushDown.pushBackup", opentracing.ChildOf(span.Context())) 62 defer span1.Finish() 63 ctx = opentracing.ContextWithSpan(ctx, span1) 64 } 65 66 // Push down backup tasks to all tikv instances. 67 res := rtree.NewRangeTree() 68 failpoint.Inject("noop-backup", func(_ failpoint.Value) { 69 logutil.CL(ctx).Warn("skipping normal backup, jump to fine-grained backup, meow :3", logutil.Key("start-key", req.StartKey), logutil.Key("end-key", req.EndKey)) 70 failpoint.Return(res, nil) 71 }) 72 73 wg := new(sync.WaitGroup) 74 for _, s := range stores { 75 store := s 76 storeID := s.GetId() 77 lctx := logutil.ContextWithField(ctx, zap.Uint64("store-id", storeID)) 78 if s.GetState() != metapb.StoreState_Up { 79 logutil.CL(lctx).Warn("skip store", zap.Stringer("State", s.GetState())) 80 continue 81 } 82 client, err := push.mgr.GetBackupClient(lctx, storeID) 83 if err != nil { 84 // BR should be able to backup even some of stores disconnected. 85 // The regions managed by this store can be retried at fine-grained backup then. 86 logutil.CL(lctx).Warn("fail to connect store, skipping", zap.Error(err)) 87 return res, nil 88 } 89 wg.Add(1) 90 go func() { 91 defer wg.Done() 92 err := SendBackup( 93 lctx, storeID, client, req, 94 func(resp *backuppb.BackupResponse) error { 95 // Forward all responses (including error). 96 push.respCh <- responseAndStore{ 97 Resp: resp, 98 Store: store, 99 } 100 return nil 101 }, 102 func() (backuppb.BackupClient, error) { 103 logutil.CL(lctx).Warn("reset the connection in push") 104 return push.mgr.ResetBackupClient(lctx, storeID) 105 }) 106 // Disconnected stores can be ignored. 107 if err != nil { 108 push.errCh <- err 109 return 110 } 111 }() 112 } 113 114 go func() { 115 wg.Wait() 116 // TODO: test concurrent receive response and close channel. 117 close(push.respCh) 118 }() 119 120 for { 121 select { 122 case respAndStore, ok := <-push.respCh: 123 resp := respAndStore.GetResponse() 124 store := respAndStore.GetStore() 125 if !ok { 126 // Finished. 127 return res, nil 128 } 129 failpoint.Inject("backup-storage-error", func(val failpoint.Value) { 130 msg := val.(string) 131 logutil.CL(ctx).Debug("failpoint backup-storage-error injected.", zap.String("msg", msg)) 132 resp.Error = &backuppb.Error{ 133 Msg: msg, 134 } 135 }) 136 failpoint.Inject("tikv-rw-error", func(val failpoint.Value) { 137 msg := val.(string) 138 logutil.CL(ctx).Debug("failpoint tikv-rw-error injected.", zap.String("msg", msg)) 139 resp.Error = &backuppb.Error{ 140 Msg: msg, 141 } 142 }) 143 if resp.GetError() == nil { 144 // None error means range has been backuped successfully. 145 res.Put( 146 resp.GetStartKey(), resp.GetEndKey(), resp.GetFiles()) 147 148 // Update progress 149 progressCallBack(RegionUnit) 150 } else { 151 errPb := resp.GetError() 152 switch v := errPb.Detail.(type) { 153 case *backuppb.Error_KvError: 154 logutil.CL(ctx).Warn("backup occur kv error", zap.Reflect("error", v)) 155 156 case *backuppb.Error_RegionError: 157 logutil.CL(ctx).Warn("backup occur region error", zap.Reflect("error", v)) 158 159 case *backuppb.Error_ClusterIdError: 160 logutil.CL(ctx).Error("backup occur cluster ID error", zap.Reflect("error", v)) 161 return res, errors.Annotatef(berrors.ErrKVClusterIDMismatch, "%v", errPb) 162 default: 163 if utils.MessageIsRetryableStorageError(errPb.GetMsg()) { 164 logutil.CL(ctx).Warn("backup occur storage error", zap.String("error", errPb.GetMsg())) 165 continue 166 } 167 if utils.MessageIsNotFoundStorageError(errPb.GetMsg()) { 168 errMsg := fmt.Sprintf("File or directory not found error occurs on TiKV Node(store id: %v; Address: %s)", store.GetId(), redact.String(store.GetAddress())) 169 logutil.CL(ctx).Error("", zap.String("error", berrors.ErrKVStorage.Error()+": "+errMsg), 170 zap.String("work around", "please ensure br and tikv node share a same disk and the user of br and tikv has same uid.")) 171 } 172 173 if utils.MessageIsPermissionDeniedStorageError(errPb.GetMsg()) { 174 errMsg := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v; Address: %s)", store.GetId(), redact.String(store.GetAddress())) 175 logutil.CL(ctx).Error("", zap.String("error", berrors.ErrKVStorage.Error()+": "+errMsg), 176 zap.String("work around", "please ensure tikv has permission to read from & write to the storage.")) 177 } 178 return res, berrors.ErrKVStorage 179 } 180 } 181 case err := <-push.errCh: 182 if !berrors.Is(err, berrors.ErrFailedToConnect) { 183 return res, errors.Annotatef(err, "failed to backup range [%s, %s)", redact.Key(req.StartKey), redact.Key(req.EndKey)) 184 } 185 logutil.CL(ctx).Warn("skipping disconnected stores", logutil.ShortError(err)) 186 return res, nil 187 } 188 } 189 }