github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/relay.go (about)

     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package ha
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"time"
    20  
    21  	"github.com/pingcap/tiflow/dm/common"
    22  	"github.com/pingcap/tiflow/dm/config"
    23  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    24  	"github.com/pingcap/tiflow/dm/pkg/log"
    25  	"github.com/pingcap/tiflow/dm/pkg/terror"
    26  	"go.etcd.io/etcd/api/v3/mvccpb"
    27  	clientv3 "go.etcd.io/etcd/client/v3"
    28  	"go.uber.org/zap"
    29  )
    30  
    31  // RelaySource represents the bound relationship between the DM-worker instance and its upstream relay source.
    32  type RelaySource struct {
    33  	Source string
    34  	// only used to report to the caller of the watcher, do not marsh it.
    35  	// if it's true, it means the bound has been deleted in etcd.
    36  	IsDeleted bool
    37  	// record the etcd ModRevision of this bound
    38  	Revision int64
    39  }
    40  
    41  // PutRelayConfig puts the relay config for given workers.
    42  // k/v: worker-name -> source-id.
    43  // TODO: let caller wait until worker has enabled relay.
    44  func PutRelayConfig(cli *clientv3.Client, source string, workers ...string) (int64, error) {
    45  	ops := make([]clientv3.Op, 0, len(workers))
    46  	for _, worker := range workers {
    47  		ops = append(ops, putRelayConfigOp(worker, source))
    48  	}
    49  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
    50  	return rev, err
    51  }
    52  
    53  // DeleteRelayConfig deletes the relay config for given workers.
    54  func DeleteRelayConfig(cli *clientv3.Client, workers ...string) (int64, error) {
    55  	ops := make([]clientv3.Op, 0, len(workers))
    56  	for _, worker := range workers {
    57  		ops = append(ops, deleteRelayConfigOp(worker))
    58  	}
    59  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
    60  	return rev, err
    61  }
    62  
    63  // GetAllRelayConfig gets all source and its relay worker.
    64  // k/v: source ID -> set(workers).
    65  func GetAllRelayConfig(cli *clientv3.Client) (map[string]map[string]struct{}, int64, error) {
    66  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
    67  	defer cancel()
    68  
    69  	resp, err := cli.Get(ctx, common.UpstreamRelayWorkerKeyAdapter.Path(), clientv3.WithPrefix())
    70  	if err != nil {
    71  		return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get all relay config")
    72  	}
    73  
    74  	ret := map[string]map[string]struct{}{}
    75  	for _, kv := range resp.Kvs {
    76  		source := string(kv.Value)
    77  		keys, err2 := common.UpstreamRelayWorkerKeyAdapter.Decode(string(kv.Key))
    78  		if err2 != nil {
    79  			return nil, 0, err2
    80  		}
    81  		if len(keys) != 1 {
    82  			// should not happened
    83  			return nil, 0, terror.Annotate(err, "illegal key of UpstreamRelayWorkerKeyAdapter")
    84  		}
    85  		worker := keys[0]
    86  		var (
    87  			ok      bool
    88  			workers map[string]struct{}
    89  		)
    90  		if workers, ok = ret[source]; !ok {
    91  			workers = map[string]struct{}{}
    92  			ret[source] = workers
    93  		}
    94  		workers[worker] = struct{}{}
    95  	}
    96  	return ret, resp.Header.Revision, nil
    97  }
    98  
    99  // GetRelayConfig returns the source config which the given worker need to pull relay log from etcd, with revision.
   100  func GetRelayConfig(cli *clientv3.Client, worker string) (*config.SourceConfig, int64, error) {
   101  	var (
   102  		source    string
   103  		newSource string
   104  		rev       int64
   105  		retryNum  = defaultGetRelayConfigRetry
   106  	)
   107  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   108  	defer cancel()
   109  
   110  	getSourceIDFromResp := func(resp *clientv3.GetResponse) (string, int64, error) {
   111  		if resp.Count == 0 {
   112  			return "", resp.Header.Revision, nil
   113  		}
   114  		if resp.Count > 1 {
   115  			return "", resp.Header.Revision, terror.ErrConfigMoreThanOne.Generate(resp.Count, "relay relationship", "worker: "+worker)
   116  		}
   117  		return string(resp.Kvs[0].Value), resp.Header.Revision, nil
   118  	}
   119  
   120  	resp, err := cli.Get(ctx, common.UpstreamRelayWorkerKeyAdapter.Encode(worker))
   121  	if err != nil {
   122  		return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "fail to get relay config")
   123  	}
   124  	source, rev, err = getSourceIDFromResp(resp)
   125  	if err != nil || source == "" {
   126  		return nil, rev, err
   127  	}
   128  
   129  	for retryCnt := 1; retryCnt <= retryNum; retryCnt++ {
   130  		txnResp, _, err2 := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(
   131  			clientv3.OpGet(common.UpstreamRelayWorkerKeyAdapter.Encode(worker)),
   132  			clientv3.OpGet(common.UpstreamConfigKeyAdapter.Encode(source))))
   133  		if err2 != nil {
   134  			return nil, 0, err
   135  		}
   136  
   137  		var rev2 int64
   138  		sourceResp := txnResp.Responses[0].GetResponseRange()
   139  		newSource, rev2, err = getSourceIDFromResp((*clientv3.GetResponse)(sourceResp))
   140  		if err != nil {
   141  			return nil, 0, err
   142  		}
   143  
   144  		if newSource != source {
   145  			log.L().Warn("relay config has been changed, will take a retry",
   146  				zap.String("old relay source", source),
   147  				zap.String("new relay source", newSource),
   148  				zap.Int("retryTime", retryCnt))
   149  			// if we are about to fail, don't update relay source to save the last source to error
   150  			if retryCnt != retryNum {
   151  				source = newSource
   152  			}
   153  			select {
   154  			case <-cli.Ctx().Done():
   155  				retryNum = 0 // stop retry
   156  			case <-time.After(retryInterval):
   157  				// retryInterval shouldn't be too long because the longer we wait, bound is more
   158  				// possible to be different from newBound
   159  			}
   160  			continue
   161  		}
   162  		// newSource == source == "" means this relay source is truly deleted
   163  		if newSource == "" {
   164  			return nil, rev2, nil
   165  		}
   166  
   167  		cfgResp := txnResp.Responses[1].GetResponseRange()
   168  		scm, err3 := sourceCfgFromResp(newSource, (*clientv3.GetResponse)(cfgResp))
   169  		if err3 != nil {
   170  			return nil, 0, err3
   171  		}
   172  		cfg, ok := scm[newSource]
   173  		// ok == false means we have got relay source but there is no source config, this shouldn't happen
   174  		if !ok {
   175  			// this should not happen.
   176  			return nil, 0, terror.ErrConfigMissingForBound.Generate(source)
   177  		}
   178  
   179  		return cfg, rev2, nil
   180  	}
   181  	return nil, 0, terror.ErrWorkerRelayConfigChanging.Generate(worker, source, newSource)
   182  }
   183  
   184  // putRelayConfigOp returns PUT etcd operations for the relay relationship of the specified DM-worker.
   185  // k/v: worker-name -> source-id.
   186  func putRelayConfigOp(worker, source string) clientv3.Op {
   187  	return clientv3.OpPut(common.UpstreamRelayWorkerKeyAdapter.Encode(worker), source)
   188  }
   189  
   190  // deleteRelayConfigOp returns a DELETE etcd operation for the relay relationship of the specified DM-worker.
   191  func deleteRelayConfigOp(worker string) clientv3.Op {
   192  	return clientv3.OpDelete(common.UpstreamRelayWorkerKeyAdapter.Encode(worker))
   193  }
   194  
   195  // WatchRelayConfig watches PUT & DELETE operations for the relay relationship of the specified DM-worker.
   196  // For the DELETE operations, it returns an nil source config.
   197  func WatchRelayConfig(ctx context.Context, cli *clientv3.Client,
   198  	worker string, revision int64, outCh chan<- RelaySource, errCh chan<- error,
   199  ) {
   200  	wCtx, cancel := context.WithCancel(ctx)
   201  	defer cancel()
   202  	ch := cli.Watch(wCtx, common.UpstreamRelayWorkerKeyAdapter.Encode(worker), clientv3.WithRev(revision))
   203  
   204  	for {
   205  		select {
   206  		case <-ctx.Done():
   207  			return
   208  		case resp, ok := <-ch:
   209  			if !ok {
   210  				return
   211  			}
   212  			if resp.Canceled {
   213  				// TODO(csuzhangxc): do retry here.
   214  				if resp.Err() != nil {
   215  					select {
   216  					case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), fmt.Sprintf("watch relay config canceled, worker %s", worker)):
   217  					case <-ctx.Done():
   218  					}
   219  				}
   220  				return
   221  			}
   222  
   223  			for _, ev := range resp.Events {
   224  				var bound RelaySource
   225  				switch ev.Type {
   226  				case mvccpb.PUT:
   227  					bound.Source = string(ev.Kv.Value)
   228  					bound.IsDeleted = false
   229  				case mvccpb.DELETE:
   230  					bound.IsDeleted = true
   231  				default:
   232  					// this should not happen.
   233  					log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Type))
   234  					continue
   235  				}
   236  				bound.Revision = ev.Kv.ModRevision
   237  
   238  				select {
   239  				case outCh <- bound:
   240  				case <-ctx.Done():
   241  					return
   242  				}
   243  			}
   244  		}
   245  	}
   246  }