github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/bound.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package ha
    15  
    16  import (
    17  	"context"
    18  	"encoding/json"
    19  	"fmt"
    20  	"time"
    21  
    22  	"github.com/pingcap/failpoint"
    23  	"github.com/pingcap/tiflow/dm/common"
    24  	"github.com/pingcap/tiflow/dm/config"
    25  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    26  	"github.com/pingcap/tiflow/dm/pkg/log"
    27  	"github.com/pingcap/tiflow/dm/pkg/terror"
    28  	"go.etcd.io/etcd/api/v3/mvccpb"
    29  	clientv3 "go.etcd.io/etcd/client/v3"
    30  	"go.uber.org/zap"
    31  )
    32  
const (
	// Reading a bound and the config of its source takes two steps: get the name/id from the bound,
	// then query the config using that id. Since these steps can't be put into one etcd transaction,
	// the second step re-reads the bound together with the config and checks the name/id is still valid.
	// If it is not valid, the second step is retried with the new name/id, at most this many times.
	defaultGetSourceBoundConfigRetry = 3
	// NOTE(review): not referenced in this part of the file; presumably the equivalent retry limit
	// for relay config lookups - confirm against its users.
	defaultGetRelayConfigRetry       = 3
	retryInterval                    = 50 * time.Millisecond // retry interval when we get two different bounds
)
    41  
// SourceBound represents the bound relationship between the DM-worker instance and the upstream MySQL source.
// Only Source and Worker are part of the JSON form; the remaining fields are filled in by the readers in this
// package and are excluded from marshaling.
type SourceBound struct {
	Source string `json:"source"` // the source ID of the upstream.
	Worker string `json:"worker"` // the name of the bound DM-worker for the source.

	// only used to report to the caller of the watcher, do not marshal it.
	// if it's true, it means the bound has been deleted in etcd.
	IsDeleted bool `json:"-"`
	// record the etcd Revision of this bound (the ModRevision of the key it was read from), do not marshal it.
	Revision int64 `json:"-"`
}
    53  
    54  // NewSourceBound creates a new SourceBound instance.
    55  func NewSourceBound(source, worker string) SourceBound {
    56  	return SourceBound{
    57  		Source: source,
    58  		Worker: worker,
    59  	}
    60  }
    61  
    62  // String implements Stringer interface.
    63  func (b SourceBound) String() string {
    64  	s, _ := b.toJSON()
    65  	return s
    66  }
    67  
    68  // toJSON returns the string of JSON represent.
    69  func (b SourceBound) toJSON() (string, error) {
    70  	data, err := json.Marshal(b)
    71  	if err != nil {
    72  		return "", terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("fail to marshal SourceBound %+v", b))
    73  	}
    74  	return string(data), nil
    75  }
    76  
    77  // IsEmpty returns true when this bound has no value.
    78  func (b SourceBound) IsEmpty() bool {
    79  	var emptyBound SourceBound
    80  	return b == emptyBound
    81  }
    82  
    83  // sourceBoundFromJSON constructs SourceBound from its JSON represent.
    84  func sourceBoundFromJSON(s string) (b SourceBound, err error) {
    85  	if err = json.Unmarshal([]byte(s), &b); err != nil {
    86  		err = terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("fail to unmarshal SourceBound %s", s))
    87  	}
    88  	return
    89  }
    90  
    91  // PutSourceBound puts the bound relationship into etcd.
    92  // k/v: worker-name -> bound relationship.
    93  func PutSourceBound(cli *clientv3.Client, bounds ...SourceBound) (int64, error) {
    94  	ops := make([]clientv3.Op, 0, len(bounds))
    95  	for _, bound := range bounds {
    96  		boundOps, err := putSourceBoundOp(bound)
    97  		if err != nil {
    98  			return 0, err
    99  		}
   100  		ops = append(ops, boundOps...)
   101  	}
   102  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   103  	return rev, err
   104  }
   105  
   106  // DeleteSourceBound deletes the bound relationship in etcd for the specified worker.
   107  func DeleteSourceBound(cli *clientv3.Client, workers ...string) (int64, error) {
   108  	ops := make([]clientv3.Op, 0, len(workers))
   109  	for _, worker := range workers {
   110  		ops = append(ops, deleteSourceBoundOp(worker)...)
   111  	}
   112  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   113  	return rev, err
   114  }
   115  
   116  // ReplaceSourceBound deletes an old bound and puts a new bound in one transaction, so a bound source will not become
   117  // unbound because of failing halfway.
   118  func ReplaceSourceBound(cli *clientv3.Client, source, oldWorker, newWorker string) (int64, error) {
   119  	deleteOps := deleteSourceBoundOp(oldWorker)
   120  	putOps, err := putSourceBoundOp(NewSourceBound(source, newWorker))
   121  	if err != nil {
   122  		return 0, err
   123  	}
   124  	ops := make([]clientv3.Op, 0, len(deleteOps)+len(putOps))
   125  	ops = append(ops, deleteOps...)
   126  	ops = append(ops, putOps...)
   127  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   128  	return rev, err
   129  }
   130  
   131  // GetSourceBound gets the source bound relationship for the specified DM-worker.
   132  // if the bound relationship for the worker name not exist, return with `err == nil`.
   133  // if the worker name is "", it will return all bound relationships as a map{worker-name: bound}.
   134  // if the worker name is given, it will return a map{worker-name: bound} whose length is 1.
   135  func GetSourceBound(cli *clientv3.Client, worker string) (map[string]SourceBound, int64, error) {
   136  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   137  	defer cancel()
   138  
   139  	var (
   140  		sbm  = make(map[string]SourceBound)
   141  		resp *clientv3.GetResponse
   142  		err  error
   143  	)
   144  	failpoint.Inject("FailToGetSourceCfg", func() {
   145  		failpoint.Return(sbm, 0, context.DeadlineExceeded)
   146  	})
   147  	if worker != "" {
   148  		resp, err = cli.Get(ctx, common.UpstreamBoundWorkerKeyAdapter.Encode(worker))
   149  	} else {
   150  		resp, err = cli.Get(ctx, common.UpstreamBoundWorkerKeyAdapter.Path(), clientv3.WithPrefix())
   151  	}
   152  
   153  	if err != nil {
   154  		return sbm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get bound relationship")
   155  	}
   156  
   157  	sbm, err = sourceBoundFromResp(worker, resp)
   158  	if err != nil {
   159  		return sbm, 0, err
   160  	}
   161  
   162  	return sbm, resp.Header.Revision, nil
   163  }
   164  
   165  // GetLastSourceBounds gets all last source bound relationship. Different with GetSourceBound, "last source bound" will
   166  // not be deleted when worker offline.
   167  func GetLastSourceBounds(cli *clientv3.Client) (map[string]SourceBound, int64, error) {
   168  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   169  	defer cancel()
   170  
   171  	sbm := make(map[string]SourceBound)
   172  	resp, err := cli.Get(ctx, common.UpstreamLastBoundWorkerKeyAdapter.Path(), clientv3.WithPrefix())
   173  	if err != nil {
   174  		return sbm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get last bound relationship")
   175  	}
   176  
   177  	sbm, err = sourceBoundFromResp("", resp)
   178  	if err != nil {
   179  		return sbm, 0, err
   180  	}
   181  
   182  	return sbm, resp.Header.Revision, nil
   183  }
   184  
// GetSourceBoundConfig gets the source bound relationship and relative source config at the same time
// for the specified DM-worker. The index worker **must not be empty**:
//   - if there is no source bound for the worker, it returns an empty SourceBound and a nil source config
//     with `err == nil`.
//   - if the source bound is not empty but its source config is missing, it returns
//     terror.ErrConfigMissingForBound.
//   - if the source bound keeps changing for retryNum times (or the client context is done while waiting),
//     it returns terror.ErrMasterBoundChanging.
//
// The returned revision is the one of the last successful etcd read, and 0 whenever an error is returned.
func GetSourceBoundConfig(cli *clientv3.Client, worker string) (SourceBound, *config.SourceConfig, int64, error) {
	var (
		bound    SourceBound
		newBound SourceBound
		cfg      *config.SourceConfig
		ok       bool
		retryNum = defaultGetSourceBoundConfigRetry
	)
	// step 1: read the current bound to learn which source config has to be fetched.
	sbm, rev, err := GetSourceBound(cli, worker)
	if err != nil {
		return bound, cfg, 0, err
	}
	if bound, ok = sbm[worker]; !ok {
		// no bound for this worker, so there is no config to look up.
		return bound, cfg, rev, nil
	}

	for retryCnt := 1; retryCnt <= retryNum; retryCnt++ {
		// step 2: read the bound again together with the config of bound.Source in one transaction,
		// so both results belong to the same etcd revision.
		txnResp, rev2, err2 := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(clientv3.OpGet(common.UpstreamBoundWorkerKeyAdapter.Encode(worker)),
			clientv3.OpGet(common.UpstreamConfigKeyAdapter.Encode(bound.Source))))
		if err2 != nil {
			return bound, cfg, 0, err2
		}

		// Responses[0] answers the bound key, Responses[1] answers the config key (same order as the ops).
		boundResp := txnResp.Responses[0].GetResponseRange()
		sbm2, err2 := sourceBoundFromResp(worker, (*clientv3.GetResponse)(boundResp))
		if err2 != nil {
			return bound, cfg, 0, err2
		}

		newBound, ok = sbm2[worker]
		// when ok is false, newBound will be empty which means bound for this worker has been deleted in this turn
		// if bound is not empty, we should wait for another turn to make sure bound is really deleted.
		// NOTE: the comparison covers all fields, including Revision which is filled by sourceBoundFromResp.
		if newBound != bound {
			log.L().Warn("source bound has been changed, will take a retry", zap.Stringer("oldBound", bound),
				zap.Stringer("newBound", newBound), zap.Int("retryTime", retryCnt))
			// if we are about to fail, don't update bound to save the last bound to error
			if retryCnt != retryNum {
				bound = newBound
			}
			select {
			case <-cli.Ctx().Done():
				retryNum = 0 // stop retry
			case <-time.After(retryInterval):
				// retryInterval shouldn't be too long because the longer we wait, bound is more
				// possible to be different from newBound
			}
			continue
		}
		// ok == false and newBound == bound means this bound is truly deleted, we don't need source config anymore
		if !ok {
			return bound, cfg, rev2, nil
		}

		// the bound is stable, so the config read in the same transaction matches bound.Source.
		cfgResp := txnResp.Responses[1].GetResponseRange()
		scm, err2 := sourceCfgFromResp(bound.Source, (*clientv3.GetResponse)(cfgResp))
		if err2 != nil {
			return bound, cfg, 0, err2
		}
		cfg, ok = scm[bound.Source]
		// ok == false means we have got source bound but there is no source config, this shouldn't happen
		if !ok {
			// this should not happen.
			return bound, cfg, 0, terror.ErrConfigMissingForBound.Generate(bound)
		}

		return bound, cfg, rev2, nil
	}

	// reached when every turn saw a different bound, or when the client context was done while waiting.
	return bound, cfg, 0, terror.ErrMasterBoundChanging.Generate(bound, newBound)
}
   260  
// WatchSourceBound watches PUT & DELETE operations for the bound relationship of the specified DM-worker,
// starting from the given etcd revision, and blocks until the watch ends.
// For the PUT operations, the bound decoded from the event value is sent to outCh.
// For the DELETE operations, an incomplete bound is sent to outCh: only Worker (decoded from the key) is set
// and IsDeleted is true.
// In both cases Revision is set to the ModRevision of the event's key-value.
// Decoding errors and watch errors are sent to errCh.
// It returns when ctx is done, when the watch channel is closed, or when the watch is canceled by etcd.
// nolint:dupl
func WatchSourceBound(ctx context.Context, cli *clientv3.Client, worker string, revision int64, outCh chan<- SourceBound, errCh chan<- error) {
	// use a derived context so the etcd watch is released whenever this function returns.
	wCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	ch := cli.Watch(wCtx, common.UpstreamBoundWorkerKeyAdapter.Encode(worker), clientv3.WithRev(revision))

	for {
		select {
		case <-ctx.Done():
			return
		case resp, ok := <-ch:
			if !ok {
				// the watch channel has been closed.
				return
			}
			if resp.Canceled {
				// TODO(csuzhangxc): do retry here.
				if resp.Err() != nil {
					// report the reason, but don't block forever if the caller has already gone.
					select {
					case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), "watch source bound key canceled"):
					case <-ctx.Done():
					}
				}
				return
			}

			for _, ev := range resp.Events {
				var (
					bound SourceBound
					err   error
				)
				switch ev.Type {
				case mvccpb.PUT:
					bound, err = sourceBoundFromJSON(string(ev.Kv.Value))
				case mvccpb.DELETE:
					// only the key is used for a DELETE event, so only the worker name can be recovered.
					bound, err = sourceBoundFromKey(string(ev.Kv.Key))
					bound.IsDeleted = true
				default:
					// this should not happen.
					log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Kv), zap.Reflect("type", ev.Type))
					continue
				}
				bound.Revision = ev.Kv.ModRevision

				// every send is guarded by ctx.Done so a caller that stopped receiving can't block the watcher.
				if err != nil {
					select {
					case errCh <- err:
					case <-ctx.Done():
						return
					}
				} else {
					select {
					case outCh <- bound:
					case <-ctx.Done():
						return
					}
				}
			}
		}
	}
}
   323  
   324  // sourceBoundFromKey constructs an incomplete bound relationship from an etcd key.
   325  func sourceBoundFromKey(key string) (SourceBound, error) {
   326  	var bound SourceBound
   327  	ks, err := common.UpstreamBoundWorkerKeyAdapter.Decode(key)
   328  	if err != nil {
   329  		return bound, err
   330  	}
   331  	bound.Worker = ks[0]
   332  	return bound, nil
   333  }
   334  
   335  func sourceBoundFromResp(worker string, resp *clientv3.GetResponse) (map[string]SourceBound, error) {
   336  	sbm := make(map[string]SourceBound)
   337  	if resp.Count == 0 {
   338  		return sbm, nil
   339  	} else if worker != "" && resp.Count > 1 {
   340  		// this should not happen.
   341  		return sbm, terror.ErrConfigMoreThanOne.Generate(resp.Count, "bound relationship", "worker: "+worker)
   342  	}
   343  
   344  	for _, kvs := range resp.Kvs {
   345  		bound, err := sourceBoundFromJSON(string(kvs.Value))
   346  		if err != nil {
   347  			return sbm, err
   348  		}
   349  		bound.Revision = kvs.ModRevision
   350  		sbm[bound.Worker] = bound
   351  	}
   352  	return sbm, nil
   353  }
   354  
   355  // deleteSourceBoundOp returns a DELETE etcd operation for the bound relationship of the specified DM-worker.
   356  func deleteSourceBoundOp(worker string) []clientv3.Op {
   357  	return []clientv3.Op{
   358  		clientv3.OpDelete(common.UpstreamBoundWorkerKeyAdapter.Encode(worker)),
   359  	}
   360  }
   361  
   362  // deleteLastSourceBoundOp returns a DELETE etcd operation for the last bound relationship of the specified DM-worker.
   363  func deleteLastSourceBoundOp(worker string) clientv3.Op {
   364  	return clientv3.OpDelete(common.UpstreamLastBoundWorkerKeyAdapter.Encode(worker))
   365  }
   366  
   367  // putSourceBoundOp returns PUT etcd operations for the bound relationship.
   368  // k/v: worker-name -> bound relationship.
   369  func putSourceBoundOp(bound SourceBound) ([]clientv3.Op, error) {
   370  	value, err := bound.toJSON()
   371  	if err != nil {
   372  		return []clientv3.Op{}, err
   373  	}
   374  	key1 := common.UpstreamBoundWorkerKeyAdapter.Encode(bound.Worker)
   375  	op1 := clientv3.OpPut(key1, value)
   376  	key2 := common.UpstreamLastBoundWorkerKeyAdapter.Encode(bound.Worker)
   377  	op2 := clientv3.OpPut(key2, value)
   378  
   379  	return []clientv3.Op{op1, op2}, nil
   380  }