github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/bound.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package ha 15 16 import ( 17 "context" 18 "encoding/json" 19 "fmt" 20 "time" 21 22 "github.com/pingcap/failpoint" 23 "github.com/pingcap/tiflow/dm/common" 24 "github.com/pingcap/tiflow/dm/config" 25 "github.com/pingcap/tiflow/dm/pkg/etcdutil" 26 "github.com/pingcap/tiflow/dm/pkg/log" 27 "github.com/pingcap/tiflow/dm/pkg/terror" 28 "go.etcd.io/etcd/api/v3/mvccpb" 29 clientv3 "go.etcd.io/etcd/client/v3" 30 "go.uber.org/zap" 31 ) 32 33 const ( 34 // we need two steps to get a name/id and query config using that id. 35 // since above steps can't be put into one etcd transaction, we combine and re-run the first step into the second 36 // step, and check the name/id is still valid. if not valid, retry the second step using new name/id. 37 defaultGetSourceBoundConfigRetry = 3 38 defaultGetRelayConfigRetry = 3 39 retryInterval = 50 * time.Millisecond // retry interval when we get two different bounds 40 ) 41 42 // SourceBound represents the bound relationship between the DM-worker instance and the upstream MySQL source. 43 type SourceBound struct { 44 Source string `json:"source"` // the source ID of the upstream. 45 Worker string `json:"worker"` // the name of the bound DM-worker for the source. 46 47 // only used to report to the caller of the watcher, do not marsh it. 48 // if it's true, it means the bound has been deleted in etcd. 49 IsDeleted bool `json:"-"` 50 // record the etcd Revision of this bound 51 Revision int64 `json:"-"` 52 } 53 54 // NewSourceBound creates a new SourceBound instance. 55 func NewSourceBound(source, worker string) SourceBound { 56 return SourceBound{ 57 Source: source, 58 Worker: worker, 59 } 60 } 61 62 // String implements Stringer interface. 63 func (b SourceBound) String() string { 64 s, _ := b.toJSON() 65 return s 66 } 67 68 // toJSON returns the string of JSON represent. 69 func (b SourceBound) toJSON() (string, error) { 70 data, err := json.Marshal(b) 71 if err != nil { 72 return "", terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("fail to marshal SourceBound %+v", b)) 73 } 74 return string(data), nil 75 } 76 77 // IsEmpty returns true when this bound has no value. 78 func (b SourceBound) IsEmpty() bool { 79 var emptyBound SourceBound 80 return b == emptyBound 81 } 82 83 // sourceBoundFromJSON constructs SourceBound from its JSON represent. 84 func sourceBoundFromJSON(s string) (b SourceBound, err error) { 85 if err = json.Unmarshal([]byte(s), &b); err != nil { 86 err = terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("fail to unmarshal SourceBound %s", s)) 87 } 88 return 89 } 90 91 // PutSourceBound puts the bound relationship into etcd. 92 // k/v: worker-name -> bound relationship. 93 func PutSourceBound(cli *clientv3.Client, bounds ...SourceBound) (int64, error) { 94 ops := make([]clientv3.Op, 0, len(bounds)) 95 for _, bound := range bounds { 96 boundOps, err := putSourceBoundOp(bound) 97 if err != nil { 98 return 0, err 99 } 100 ops = append(ops, boundOps...) 101 } 102 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 103 return rev, err 104 } 105 106 // DeleteSourceBound deletes the bound relationship in etcd for the specified worker. 107 func DeleteSourceBound(cli *clientv3.Client, workers ...string) (int64, error) { 108 ops := make([]clientv3.Op, 0, len(workers)) 109 for _, worker := range workers { 110 ops = append(ops, deleteSourceBoundOp(worker)...) 111 } 112 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 113 return rev, err 114 } 115 116 // ReplaceSourceBound deletes an old bound and puts a new bound in one transaction, so a bound source will not become 117 // unbound because of failing halfway. 118 func ReplaceSourceBound(cli *clientv3.Client, source, oldWorker, newWorker string) (int64, error) { 119 deleteOps := deleteSourceBoundOp(oldWorker) 120 putOps, err := putSourceBoundOp(NewSourceBound(source, newWorker)) 121 if err != nil { 122 return 0, err 123 } 124 ops := make([]clientv3.Op, 0, len(deleteOps)+len(putOps)) 125 ops = append(ops, deleteOps...) 126 ops = append(ops, putOps...) 127 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 128 return rev, err 129 } 130 131 // GetSourceBound gets the source bound relationship for the specified DM-worker. 132 // if the bound relationship for the worker name not exist, return with `err == nil`. 133 // if the worker name is "", it will return all bound relationships as a map{worker-name: bound}. 134 // if the worker name is given, it will return a map{worker-name: bound} whose length is 1. 135 func GetSourceBound(cli *clientv3.Client, worker string) (map[string]SourceBound, int64, error) { 136 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 137 defer cancel() 138 139 var ( 140 sbm = make(map[string]SourceBound) 141 resp *clientv3.GetResponse 142 err error 143 ) 144 failpoint.Inject("FailToGetSourceCfg", func() { 145 failpoint.Return(sbm, 0, context.DeadlineExceeded) 146 }) 147 if worker != "" { 148 resp, err = cli.Get(ctx, common.UpstreamBoundWorkerKeyAdapter.Encode(worker)) 149 } else { 150 resp, err = cli.Get(ctx, common.UpstreamBoundWorkerKeyAdapter.Path(), clientv3.WithPrefix()) 151 } 152 153 if err != nil { 154 return sbm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get bound relationship") 155 } 156 157 sbm, err = sourceBoundFromResp(worker, resp) 158 if err != nil { 159 return sbm, 0, err 160 } 161 162 return sbm, resp.Header.Revision, nil 163 } 164 165 // GetLastSourceBounds gets all last source bound relationship. Different with GetSourceBound, "last source bound" will 166 // not be deleted when worker offline. 167 func GetLastSourceBounds(cli *clientv3.Client) (map[string]SourceBound, int64, error) { 168 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 169 defer cancel() 170 171 sbm := make(map[string]SourceBound) 172 resp, err := cli.Get(ctx, common.UpstreamLastBoundWorkerKeyAdapter.Path(), clientv3.WithPrefix()) 173 if err != nil { 174 return sbm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get last bound relationship") 175 } 176 177 sbm, err = sourceBoundFromResp("", resp) 178 if err != nil { 179 return sbm, 0, err 180 } 181 182 return sbm, resp.Header.Revision, nil 183 } 184 185 // GetSourceBoundConfig gets the source bound relationship and relative source config at the same time 186 // for the specified DM-worker. The index worker **must not be empty**: 187 // if source bound is empty, will return an empty sourceBound and an empty source config 188 // if source bound is not empty but sourceConfig is empty, will return an error 189 // if the source bound is different for over retryNum times, will return an error. 190 func GetSourceBoundConfig(cli *clientv3.Client, worker string) (SourceBound, *config.SourceConfig, int64, error) { 191 var ( 192 bound SourceBound 193 newBound SourceBound 194 cfg *config.SourceConfig 195 ok bool 196 retryNum = defaultGetSourceBoundConfigRetry 197 ) 198 sbm, rev, err := GetSourceBound(cli, worker) 199 if err != nil { 200 return bound, cfg, 0, err 201 } 202 if bound, ok = sbm[worker]; !ok { 203 return bound, cfg, rev, nil 204 } 205 206 for retryCnt := 1; retryCnt <= retryNum; retryCnt++ { 207 txnResp, rev2, err2 := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(clientv3.OpGet(common.UpstreamBoundWorkerKeyAdapter.Encode(worker)), 208 clientv3.OpGet(common.UpstreamConfigKeyAdapter.Encode(bound.Source)))) 209 if err2 != nil { 210 return bound, cfg, 0, err2 211 } 212 213 boundResp := txnResp.Responses[0].GetResponseRange() 214 sbm2, err2 := sourceBoundFromResp(worker, (*clientv3.GetResponse)(boundResp)) 215 if err2 != nil { 216 return bound, cfg, 0, err2 217 } 218 219 newBound, ok = sbm2[worker] 220 // when ok is false, newBound will be empty which means bound for this worker has been deleted in this turn 221 // if bound is not empty, we should wait for another turn to make sure bound is really deleted. 222 if newBound != bound { 223 log.L().Warn("source bound has been changed, will take a retry", zap.Stringer("oldBound", bound), 224 zap.Stringer("newBound", newBound), zap.Int("retryTime", retryCnt)) 225 // if we are about to fail, don't update bound to save the last bound to error 226 if retryCnt != retryNum { 227 bound = newBound 228 } 229 select { 230 case <-cli.Ctx().Done(): 231 retryNum = 0 // stop retry 232 case <-time.After(retryInterval): 233 // retryInterval shouldn't be too long because the longer we wait, bound is more 234 // possible to be different from newBound 235 } 236 continue 237 } 238 // ok == false and newBound == bound means this bound is truly deleted, we don't need source config anymore 239 if !ok { 240 return bound, cfg, rev2, nil 241 } 242 243 cfgResp := txnResp.Responses[1].GetResponseRange() 244 scm, err2 := sourceCfgFromResp(bound.Source, (*clientv3.GetResponse)(cfgResp)) 245 if err2 != nil { 246 return bound, cfg, 0, err2 247 } 248 cfg, ok = scm[bound.Source] 249 // ok == false means we have got source bound but there is no source config, this shouldn't happen 250 if !ok { 251 // this should not happen. 252 return bound, cfg, 0, terror.ErrConfigMissingForBound.Generate(bound) 253 } 254 255 return bound, cfg, rev2, nil 256 } 257 258 return bound, cfg, 0, terror.ErrMasterBoundChanging.Generate(bound, newBound) 259 } 260 261 // WatchSourceBound watches PUT & DELETE operations for the bound relationship of the specified DM-worker. 262 // For the DELETE operations, it returns an empty bound relationship. 263 // nolint:dupl 264 func WatchSourceBound(ctx context.Context, cli *clientv3.Client, worker string, revision int64, outCh chan<- SourceBound, errCh chan<- error) { 265 wCtx, cancel := context.WithCancel(ctx) 266 defer cancel() 267 ch := cli.Watch(wCtx, common.UpstreamBoundWorkerKeyAdapter.Encode(worker), clientv3.WithRev(revision)) 268 269 for { 270 select { 271 case <-ctx.Done(): 272 return 273 case resp, ok := <-ch: 274 if !ok { 275 return 276 } 277 if resp.Canceled { 278 // TODO(csuzhangxc): do retry here. 279 if resp.Err() != nil { 280 select { 281 case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), "watch source bound key canceled"): 282 case <-ctx.Done(): 283 } 284 } 285 return 286 } 287 288 for _, ev := range resp.Events { 289 var ( 290 bound SourceBound 291 err error 292 ) 293 switch ev.Type { 294 case mvccpb.PUT: 295 bound, err = sourceBoundFromJSON(string(ev.Kv.Value)) 296 case mvccpb.DELETE: 297 bound, err = sourceBoundFromKey(string(ev.Kv.Key)) 298 bound.IsDeleted = true 299 default: 300 // this should not happen. 301 log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Type)) 302 continue 303 } 304 bound.Revision = ev.Kv.ModRevision 305 306 if err != nil { 307 select { 308 case errCh <- err: 309 case <-ctx.Done(): 310 return 311 } 312 } else { 313 select { 314 case outCh <- bound: 315 case <-ctx.Done(): 316 return 317 } 318 } 319 } 320 } 321 } 322 } 323 324 // sourceBoundFromKey constructs an incomplete bound relationship from an etcd key. 325 func sourceBoundFromKey(key string) (SourceBound, error) { 326 var bound SourceBound 327 ks, err := common.UpstreamBoundWorkerKeyAdapter.Decode(key) 328 if err != nil { 329 return bound, err 330 } 331 bound.Worker = ks[0] 332 return bound, nil 333 } 334 335 func sourceBoundFromResp(worker string, resp *clientv3.GetResponse) (map[string]SourceBound, error) { 336 sbm := make(map[string]SourceBound) 337 if resp.Count == 0 { 338 return sbm, nil 339 } else if worker != "" && resp.Count > 1 { 340 // this should not happen. 341 return sbm, terror.ErrConfigMoreThanOne.Generate(resp.Count, "bound relationship", "worker: "+worker) 342 } 343 344 for _, kvs := range resp.Kvs { 345 bound, err := sourceBoundFromJSON(string(kvs.Value)) 346 if err != nil { 347 return sbm, err 348 } 349 bound.Revision = kvs.ModRevision 350 sbm[bound.Worker] = bound 351 } 352 return sbm, nil 353 } 354 355 // deleteSourceBoundOp returns a DELETE etcd operation for the bound relationship of the specified DM-worker. 356 func deleteSourceBoundOp(worker string) []clientv3.Op { 357 return []clientv3.Op{ 358 clientv3.OpDelete(common.UpstreamBoundWorkerKeyAdapter.Encode(worker)), 359 } 360 } 361 362 // deleteLastSourceBoundOp returns a DELETE etcd operation for the last bound relationship of the specified DM-worker. 363 func deleteLastSourceBoundOp(worker string) clientv3.Op { 364 return clientv3.OpDelete(common.UpstreamLastBoundWorkerKeyAdapter.Encode(worker)) 365 } 366 367 // putSourceBoundOp returns PUT etcd operations for the bound relationship. 368 // k/v: worker-name -> bound relationship. 369 func putSourceBoundOp(bound SourceBound) ([]clientv3.Op, error) { 370 value, err := bound.toJSON() 371 if err != nil { 372 return []clientv3.Op{}, err 373 } 374 key1 := common.UpstreamBoundWorkerKeyAdapter.Encode(bound.Worker) 375 op1 := clientv3.OpPut(key1, value) 376 key2 := common.UpstreamLastBoundWorkerKeyAdapter.Encode(bound.Worker) 377 op2 := clientv3.OpPut(key2, value) 378 379 return []clientv3.Op{op1, op2}, nil 380 }