github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/relay.go (about) 1 // Copyright 2021 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package ha 15 16 import ( 17 "context" 18 "fmt" 19 "time" 20 21 "github.com/pingcap/tiflow/dm/common" 22 "github.com/pingcap/tiflow/dm/config" 23 "github.com/pingcap/tiflow/dm/pkg/etcdutil" 24 "github.com/pingcap/tiflow/dm/pkg/log" 25 "github.com/pingcap/tiflow/dm/pkg/terror" 26 "go.etcd.io/etcd/api/v3/mvccpb" 27 clientv3 "go.etcd.io/etcd/client/v3" 28 "go.uber.org/zap" 29 ) 30 31 // RelaySource represents the bound relationship between the DM-worker instance and its upstream relay source. 32 type RelaySource struct { 33 Source string 34 // only used to report to the caller of the watcher, do not marsh it. 35 // if it's true, it means the bound has been deleted in etcd. 36 IsDeleted bool 37 // record the etcd ModRevision of this bound 38 Revision int64 39 } 40 41 // PutRelayConfig puts the relay config for given workers. 42 // k/v: worker-name -> source-id. 43 // TODO: let caller wait until worker has enabled relay. 44 func PutRelayConfig(cli *clientv3.Client, source string, workers ...string) (int64, error) { 45 ops := make([]clientv3.Op, 0, len(workers)) 46 for _, worker := range workers { 47 ops = append(ops, putRelayConfigOp(worker, source)) 48 } 49 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 50 return rev, err 51 } 52 53 // DeleteRelayConfig deletes the relay config for given workers. 54 func DeleteRelayConfig(cli *clientv3.Client, workers ...string) (int64, error) { 55 ops := make([]clientv3.Op, 0, len(workers)) 56 for _, worker := range workers { 57 ops = append(ops, deleteRelayConfigOp(worker)) 58 } 59 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 60 return rev, err 61 } 62 63 // GetAllRelayConfig gets all source and its relay worker. 64 // k/v: source ID -> set(workers). 65 func GetAllRelayConfig(cli *clientv3.Client) (map[string]map[string]struct{}, int64, error) { 66 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 67 defer cancel() 68 69 resp, err := cli.Get(ctx, common.UpstreamRelayWorkerKeyAdapter.Path(), clientv3.WithPrefix()) 70 if err != nil { 71 return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get all relay config") 72 } 73 74 ret := map[string]map[string]struct{}{} 75 for _, kv := range resp.Kvs { 76 source := string(kv.Value) 77 keys, err2 := common.UpstreamRelayWorkerKeyAdapter.Decode(string(kv.Key)) 78 if err2 != nil { 79 return nil, 0, err2 80 } 81 if len(keys) != 1 { 82 // should not happened 83 return nil, 0, terror.Annotate(err, "illegal key of UpstreamRelayWorkerKeyAdapter") 84 } 85 worker := keys[0] 86 var ( 87 ok bool 88 workers map[string]struct{} 89 ) 90 if workers, ok = ret[source]; !ok { 91 workers = map[string]struct{}{} 92 ret[source] = workers 93 } 94 workers[worker] = struct{}{} 95 } 96 return ret, resp.Header.Revision, nil 97 } 98 99 // GetRelayConfig returns the source config which the given worker need to pull relay log from etcd, with revision. 100 func GetRelayConfig(cli *clientv3.Client, worker string) (*config.SourceConfig, int64, error) { 101 var ( 102 source string 103 newSource string 104 rev int64 105 retryNum = defaultGetRelayConfigRetry 106 ) 107 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 108 defer cancel() 109 110 getSourceIDFromResp := func(resp *clientv3.GetResponse) (string, int64, error) { 111 if resp.Count == 0 { 112 return "", resp.Header.Revision, nil 113 } 114 if resp.Count > 1 { 115 return "", resp.Header.Revision, terror.ErrConfigMoreThanOne.Generate(resp.Count, "relay relationship", "worker: "+worker) 116 } 117 return string(resp.Kvs[0].Value), resp.Header.Revision, nil 118 } 119 120 resp, err := cli.Get(ctx, common.UpstreamRelayWorkerKeyAdapter.Encode(worker)) 121 if err != nil { 122 return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get relay config") 123 } 124 source, rev, err = getSourceIDFromResp(resp) 125 if err != nil || source == "" { 126 return nil, rev, err 127 } 128 129 for retryCnt := 1; retryCnt <= retryNum; retryCnt++ { 130 txnResp, _, err2 := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc( 131 clientv3.OpGet(common.UpstreamRelayWorkerKeyAdapter.Encode(worker)), 132 clientv3.OpGet(common.UpstreamConfigKeyAdapter.Encode(source)))) 133 if err2 != nil { 134 return nil, 0, err 135 } 136 137 var rev2 int64 138 sourceResp := txnResp.Responses[0].GetResponseRange() 139 newSource, rev2, err = getSourceIDFromResp((*clientv3.GetResponse)(sourceResp)) 140 if err != nil { 141 return nil, 0, err 142 } 143 144 if newSource != source { 145 log.L().Warn("relay config has been changed, will take a retry", 146 zap.String("old relay source", source), 147 zap.String("new relay source", newSource), 148 zap.Int("retryTime", retryCnt)) 149 // if we are about to fail, don't update relay source to save the last source to error 150 if retryCnt != retryNum { 151 source = newSource 152 } 153 select { 154 case <-cli.Ctx().Done(): 155 retryNum = 0 // stop retry 156 case <-time.After(retryInterval): 157 // retryInterval shouldn't be too long because the longer we wait, bound is more 158 // possible to be different from newBound 159 } 160 continue 161 } 162 // newSource == source == "" means this relay source is truly deleted 163 if newSource == "" { 164 return nil, rev2, nil 165 } 166 167 cfgResp := txnResp.Responses[1].GetResponseRange() 168 scm, err3 := sourceCfgFromResp(newSource, (*clientv3.GetResponse)(cfgResp)) 169 if err3 != nil { 170 return nil, 0, err3 171 } 172 cfg, ok := scm[newSource] 173 // ok == false means we have got relay source but there is no source config, this shouldn't happen 174 if !ok { 175 // this should not happen. 176 return nil, 0, terror.ErrConfigMissingForBound.Generate(source) 177 } 178 179 return cfg, rev2, nil 180 } 181 return nil, 0, terror.ErrWorkerRelayConfigChanging.Generate(worker, source, newSource) 182 } 183 184 // putRelayConfigOp returns PUT etcd operations for the relay relationship of the specified DM-worker. 185 // k/v: worker-name -> source-id. 186 func putRelayConfigOp(worker, source string) clientv3.Op { 187 return clientv3.OpPut(common.UpstreamRelayWorkerKeyAdapter.Encode(worker), source) 188 } 189 190 // deleteRelayConfigOp returns a DELETE etcd operation for the relay relationship of the specified DM-worker. 191 func deleteRelayConfigOp(worker string) clientv3.Op { 192 return clientv3.OpDelete(common.UpstreamRelayWorkerKeyAdapter.Encode(worker)) 193 } 194 195 // WatchRelayConfig watches PUT & DELETE operations for the relay relationship of the specified DM-worker. 196 // For the DELETE operations, it returns an nil source config. 197 func WatchRelayConfig(ctx context.Context, cli *clientv3.Client, 198 worker string, revision int64, outCh chan<- RelaySource, errCh chan<- error, 199 ) { 200 wCtx, cancel := context.WithCancel(ctx) 201 defer cancel() 202 ch := cli.Watch(wCtx, common.UpstreamRelayWorkerKeyAdapter.Encode(worker), clientv3.WithRev(revision)) 203 204 for { 205 select { 206 case <-ctx.Done(): 207 return 208 case resp, ok := <-ch: 209 if !ok { 210 return 211 } 212 if resp.Canceled { 213 // TODO(csuzhangxc): do retry here. 214 if resp.Err() != nil { 215 select { 216 case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), fmt.Sprintf("watch relay config canceled, worker %s", worker)): 217 case <-ctx.Done(): 218 } 219 } 220 return 221 } 222 223 for _, ev := range resp.Events { 224 var bound RelaySource 225 switch ev.Type { 226 case mvccpb.PUT: 227 bound.Source = string(ev.Kv.Value) 228 bound.IsDeleted = false 229 case mvccpb.DELETE: 230 bound.IsDeleted = true 231 default: 232 // this should not happen. 233 log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Type)) 234 continue 235 } 236 bound.Revision = ev.Kv.ModRevision 237 238 select { 239 case outCh <- bound: 240 case <-ctx.Done(): 241 return 242 } 243 } 244 } 245 } 246 }