github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/stage.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package ha
    15  
    16  import (
    17  	"context"
    18  	"encoding/json"
    19  	"fmt"
    20  
    21  	"github.com/pingcap/tiflow/dm/common"
    22  	"github.com/pingcap/tiflow/dm/config"
    23  	"github.com/pingcap/tiflow/dm/pb"
    24  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    25  	"github.com/pingcap/tiflow/dm/pkg/log"
    26  	"github.com/pingcap/tiflow/dm/pkg/terror"
    27  	"go.etcd.io/etcd/api/v3/mvccpb"
    28  	clientv3 "go.etcd.io/etcd/client/v3"
    29  	"go.uber.org/zap"
    30  )
    31  
// Stage represents the running stage for a relay or subtask.
// Only Expect, Source and Task take part in the JSON representation;
// IsDeleted and Revision are tagged `json:"-"` and are never marshaled.
type Stage struct {
	Expect pb.Stage `json:"expect"`         // the expected stage.
	Source string   `json:"source"`         // the source ID of the upstream.
	Task   string   `json:"task,omitempty"` // the task name for subtask; empty for relay.

	// only used to report to the caller of the watcher, do not marshal it.
	// if it's true, it means the stage has been deleted in etcd.
	IsDeleted bool `json:"-"`
	// record the etcd Revision of this Stage, do not marshal it.
	Revision int64 `json:"-"`
}
    44  
    45  // NewRelayStage creates a new Stage instance for relay.
    46  func NewRelayStage(expect pb.Stage, source string) Stage {
    47  	return newStage(expect, source, "")
    48  }
    49  
    50  // NewSubTaskStage creates a new Stage instance for subtask.
    51  func NewSubTaskStage(expect pb.Stage, source, task string) Stage {
    52  	return newStage(expect, source, task)
    53  }
    54  
    55  func NewValidatorStage(expect pb.Stage, source, task string) Stage {
    56  	return newStage(expect, source, task)
    57  }
    58  
    59  // newStage creates a new Stage instance.
    60  func newStage(expect pb.Stage, source, task string) Stage {
    61  	return Stage{
    62  		Expect: expect,
    63  		Source: source,
    64  		Task:   task,
    65  	}
    66  }
    67  
    68  // String implements Stringer interface.
    69  func (s Stage) String() string {
    70  	str, _ := s.toJSON()
    71  	return str
    72  }
    73  
    74  // toJSON returns the string of JSON represent.
    75  func (s Stage) toJSON() (string, error) {
    76  	data, err := json.Marshal(s)
    77  	if err != nil {
    78  		return "", terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("failed to marshal stage %+v", s))
    79  	}
    80  	return string(data), nil
    81  }
    82  
    83  // IsEmpty returns true when this Stage has no value.
    84  func (s Stage) IsEmpty() bool {
    85  	var emptyStage Stage
    86  	return s == emptyStage
    87  }
    88  
    89  // stageFromJSON constructs Stage from its JSON represent.
    90  func stageFromJSON(str string) (s Stage, err error) {
    91  	if err = json.Unmarshal([]byte(str), &s); err != nil {
    92  		err = terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("failed to unmarshal stage %s", str))
    93  	}
    94  	return
    95  }
    96  
    97  // PutRelayStage puts the stage of the relay into etcd.
    98  // k/v: sourceID -> the running stage of the relay.
    99  func PutRelayStage(cli *clientv3.Client, stages ...Stage) (int64, error) {
   100  	ops, err := putRelayStageOp(stages...)
   101  	if err != nil {
   102  		return 0, err
   103  	}
   104  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   105  	return rev, err
   106  }
   107  
   108  // DeleteRelayStage deleted the relay stage of this source.
   109  func DeleteRelayStage(cli *clientv3.Client, source string) (int64, error) {
   110  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(deleteRelayStageOp(source)))
   111  	return rev, err
   112  }
   113  
   114  // PutSubTaskStage puts the stage of the subtask into etcd.
   115  // k/v: sourceID, task -> the running stage of the subtask.
   116  func PutSubTaskStage(cli *clientv3.Client, stages ...Stage) (int64, error) {
   117  	ops, err := putSubTaskStageOp(stages...)
   118  	if err != nil {
   119  		return 0, err
   120  	}
   121  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   122  	return rev, err
   123  }
   124  
   125  // GetRelayStage gets the relay stage for the specified upstream source.
   126  // if the stage for the source not exist, return with `err == nil` and `revision=0`.
   127  func GetRelayStage(cli *clientv3.Client, source string) (Stage, int64, error) {
   128  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   129  	defer cancel()
   130  
   131  	var stage Stage
   132  	resp, err := cli.Get(ctx, common.StageRelayKeyAdapter.Encode(source))
   133  	if err != nil {
   134  		return stage, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get relay stage for source %s", source)
   135  	}
   136  
   137  	if resp.Count == 0 {
   138  		return stage, resp.Header.Revision, nil
   139  	} else if resp.Count > 1 {
   140  		// this should not happen.
   141  		return stage, 0, terror.ErrConfigMoreThanOne.Generate(resp.Count, "relay stage", "source: "+source)
   142  	}
   143  
   144  	stage, err = stageFromJSON(string(resp.Kvs[0].Value))
   145  	if err != nil {
   146  		return stage, 0, err
   147  	}
   148  	stage.Revision = resp.Kvs[0].ModRevision
   149  
   150  	return stage, resp.Header.Revision, nil
   151  }
   152  
   153  // GetAllRelayStage gets all relay stages.
   154  // k/v: source ID -> relay stage.
   155  func GetAllRelayStage(cli *clientv3.Client) (map[string]Stage, int64, error) {
   156  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   157  	defer cancel()
   158  
   159  	resp, err := cli.Get(ctx, common.StageRelayKeyAdapter.Path(), clientv3.WithPrefix())
   160  	if err != nil {
   161  		return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get all relay stages")
   162  	}
   163  
   164  	stages := make(map[string]Stage)
   165  	for _, kv := range resp.Kvs {
   166  		stage, err2 := stageFromJSON(string(kv.Value))
   167  		if err2 != nil {
   168  			return nil, 0, err2
   169  		}
   170  		stage.Revision = kv.ModRevision
   171  		stages[stage.Source] = stage
   172  	}
   173  	return stages, resp.Header.Revision, nil
   174  }
   175  
   176  // GetSubTaskStage gets the subtask stage for the specified upstream source and task name.
   177  // if the stage for the source and task name not exist, return with `err == nil` and `revision=0`.
   178  // if task name is "", it will return all subtasks' stage as a map{task-name: stage} for the source.
   179  // if task name is given, it will return a map{task-name: stage} whose length is 1.
   180  func GetSubTaskStage(cli *clientv3.Client, source, task string) (map[string]Stage, int64, error) {
   181  	return getStageByKey(cli, common.StageSubTaskKeyAdapter, source, task, 0)
   182  }
   183  
   184  func getStageByKey(cli *clientv3.Client, key common.KeyAdapter, source, task string, revision int64) (map[string]Stage, int64, error) {
   185  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   186  	defer cancel()
   187  
   188  	var (
   189  		stm  = make(map[string]Stage)
   190  		resp *clientv3.GetResponse
   191  		err  error
   192  		opts = make([]clientv3.OpOption, 0)
   193  	)
   194  	if revision > 0 {
   195  		opts = append(opts, clientv3.WithRev(revision))
   196  	}
   197  	if task != "" {
   198  		resp, err = cli.Get(ctx, key.Encode(source, task), opts...)
   199  	} else {
   200  		opts = append(opts, clientv3.WithPrefix())
   201  		resp, err = cli.Get(ctx, key.Encode(source), opts...)
   202  	}
   203  
   204  	if err != nil {
   205  		return stm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get subtask stage for source %s, task %s", source, task)
   206  	}
   207  
   208  	stages, err := getStagesFromResp(source, task, resp)
   209  	if err != nil {
   210  		return stm, 0, err
   211  	}
   212  	stm = stages[source]
   213  
   214  	return stm, resp.Header.Revision, nil
   215  }
   216  
   217  func GetValidatorStage(cli *clientv3.Client, source, task string, revision int64) (map[string]Stage, int64, error) {
   218  	return getStageByKey(cli, common.StageValidatorKeyAdapter, source, task, revision)
   219  }
   220  
   221  // GetAllSubTaskStage gets all subtask stages.
   222  // k/v: source ID -> task name -> subtask stage.
   223  func GetAllSubTaskStage(cli *clientv3.Client) (map[string]map[string]Stage, int64, error) {
   224  	return getAllStagesInner(cli, common.StageSubTaskKeyAdapter)
   225  }
   226  
   227  func getAllStagesInner(cli *clientv3.Client, key common.KeyAdapter) (map[string]map[string]Stage, int64, error) {
   228  	ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout)
   229  	defer cancel()
   230  
   231  	resp, err := cli.Get(ctx, key.Path(), clientv3.WithPrefix())
   232  	if err != nil {
   233  		return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get all subtask stages")
   234  	}
   235  
   236  	stages, err := getStagesFromResp("", "", resp)
   237  	if err != nil {
   238  		return nil, 0, err
   239  	}
   240  
   241  	return stages, resp.Header.Revision, nil
   242  }
   243  
   244  func GetAllValidatorStage(cli *clientv3.Client) (map[string]map[string]Stage, int64, error) {
   245  	return getAllStagesInner(cli, common.StageValidatorKeyAdapter)
   246  }
   247  
   248  // GetSubTaskStageConfig gets source's subtask stages and configs at the same time
   249  // source **must not be empty**
   250  // return map{task name -> subtask stage}, map{task name -> validator stage}, map{task name -> subtask config}, revision, error.
   251  func GetSubTaskStageConfig(cli *clientv3.Client, source string) (map[string]Stage, map[string]Stage, map[string]config.SubTaskConfig, int64, error) {
   252  	var (
   253  		stm               = make(map[string]Stage)
   254  		validatorStageMap = make(map[string]Stage)
   255  		scm               = make(map[string]config.SubTaskConfig)
   256  	)
   257  	txnResp, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(
   258  		clientv3.OpGet(common.StageSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix()),
   259  		clientv3.OpGet(common.StageValidatorKeyAdapter.Encode(source), clientv3.WithPrefix()),
   260  		clientv3.OpGet(common.UpstreamSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix())))
   261  	if err != nil {
   262  		return stm, validatorStageMap, scm, 0, err
   263  	}
   264  	stageResp := txnResp.Responses[0].GetResponseRange()
   265  	stages, err := getStagesFromResp(source, "", (*clientv3.GetResponse)(stageResp))
   266  	if err != nil {
   267  		return stm, validatorStageMap, scm, 0, err
   268  	}
   269  	stm = stages[source]
   270  
   271  	validatorStageResp := txnResp.Responses[1].GetResponseRange()
   272  	validatorStages, err := getStagesFromResp(source, "", (*clientv3.GetResponse)(validatorStageResp))
   273  	if err != nil {
   274  		return stm, validatorStageMap, scm, 0, err
   275  	}
   276  	validatorStageMap = validatorStages[source]
   277  
   278  	cfgResp := txnResp.Responses[2].GetResponseRange()
   279  	cfgs, err := subTaskCfgFromResp(source, "", (*clientv3.GetResponse)(cfgResp))
   280  	if err != nil {
   281  		return stm, validatorStageMap, scm, 0, err
   282  	}
   283  	scm = cfgs[source]
   284  
   285  	return stm, validatorStageMap, scm, rev, err
   286  }
   287  
   288  // WatchRelayStage watches PUT & DELETE operations for the relay stage.
   289  // for the DELETE stage, it returns an empty stage.
   290  func WatchRelayStage(ctx context.Context, cli *clientv3.Client,
   291  	source string, revision int64, outCh chan<- Stage, errCh chan<- error,
   292  ) {
   293  	wCtx, cancel := context.WithCancel(ctx)
   294  	defer cancel()
   295  	ch := cli.Watch(wCtx, common.StageRelayKeyAdapter.Encode(source), clientv3.WithRev(revision))
   296  	watchStage(ctx, ch, relayStageFromKey, outCh, errCh)
   297  }
   298  
   299  // WatchSubTaskStage watches PUT & DELETE operations for the subtask stage.
   300  // for the DELETE stage, it returns an empty stage.
   301  func WatchSubTaskStage(ctx context.Context, cli *clientv3.Client,
   302  	source string, revision int64, outCh chan<- Stage, errCh chan<- error,
   303  ) {
   304  	wCtx, cancel := context.WithCancel(ctx)
   305  	defer cancel()
   306  	ch := cli.Watch(wCtx, common.StageSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix(), clientv3.WithRev(revision))
   307  	watchStage(ctx, ch, subTaskStageFromKey, outCh, errCh)
   308  }
   309  
   310  func WatchValidatorStage(ctx context.Context, cli *clientv3.Client,
   311  	source string, rev int64, outCh chan<- Stage, errCh chan<- error,
   312  ) {
   313  	wCtx, cancel := context.WithCancel(ctx)
   314  	defer cancel()
   315  	ch := cli.Watch(wCtx, common.StageValidatorKeyAdapter.Encode(source), clientv3.WithPrefix(), clientv3.WithRev(rev))
   316  	watchStage(ctx, ch, validatorStageFromKey, outCh, errCh)
   317  }
   318  
   319  // DeleteSubTaskStage deletes the subtask stage.
   320  func DeleteSubTaskStage(cli *clientv3.Client, stages ...Stage) (int64, error) {
   321  	ops := deleteSubTaskStageOp(stages...)
   322  	_, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...))
   323  	return rev, err
   324  }
   325  
   326  // relayStageFromKey constructs an incomplete relay stage from an etcd key.
   327  func relayStageFromKey(key string) (Stage, error) {
   328  	var stage Stage
   329  	ks, err := common.StageRelayKeyAdapter.Decode(key)
   330  	if err != nil {
   331  		return stage, err
   332  	}
   333  	stage.Source = ks[0]
   334  	return stage, nil
   335  }
   336  
   337  // subTaskStageFromKey constructs an incomplete subtask stage from an etcd key.
   338  func subTaskStageFromKey(key string) (Stage, error) {
   339  	var stage Stage
   340  	ks, err := common.StageSubTaskKeyAdapter.Decode(key)
   341  	if err != nil {
   342  		return stage, err
   343  	}
   344  	stage.Source = ks[0]
   345  	stage.Task = ks[1]
   346  	return stage, nil
   347  }
   348  
   349  func validatorStageFromKey(key string) (Stage, error) {
   350  	var stage Stage
   351  	ks, err := common.StageValidatorKeyAdapter.Decode(key)
   352  	if err != nil {
   353  		return stage, err
   354  	}
   355  	stage.Source = ks[0]
   356  	stage.Task = ks[1]
   357  	return stage, nil
   358  }
   359  
   360  func getStagesFromResp(source, task string, resp *clientv3.GetResponse) (map[string]map[string]Stage, error) {
   361  	stages := make(map[string]map[string]Stage)
   362  	if source != "" {
   363  		stages[source] = make(map[string]Stage) // avoid stages[source] is nil
   364  	}
   365  
   366  	if resp.Count == 0 {
   367  		return stages, nil
   368  	} else if source != "" && task != "" && resp.Count > 1 {
   369  		// this should not happen.
   370  		return stages, terror.ErrConfigMoreThanOne.Generate(resp.Count, "stage", "(source "+source+", task "+task+")")
   371  	}
   372  
   373  	for _, kvs := range resp.Kvs {
   374  		stage, err := stageFromJSON(string(kvs.Value))
   375  		if err != nil {
   376  			return nil, err
   377  		}
   378  		if _, ok := stages[stage.Source]; !ok {
   379  			stages[stage.Source] = make(map[string]Stage)
   380  		}
   381  		stage.Revision = kvs.ModRevision
   382  		stages[stage.Source][stage.Task] = stage
   383  	}
   384  	return stages, nil
   385  }
   386  
   387  // watchStage watches PUT & DELETE operations for the stage.
   388  // nolint:dupl
   389  func watchStage(ctx context.Context, watchCh clientv3.WatchChan,
   390  	stageFromKey func(key string) (Stage, error), outCh chan<- Stage, errCh chan<- error,
   391  ) {
   392  	for {
   393  		select {
   394  		case <-ctx.Done():
   395  			return
   396  		case resp, ok := <-watchCh:
   397  			if !ok {
   398  				return
   399  			}
   400  			if resp.Canceled {
   401  				// TODO(csuzhangxc): do retry here.
   402  				if resp.Err() != nil {
   403  					select {
   404  					case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), "watch stage canceled"):
   405  					case <-ctx.Done():
   406  					}
   407  				}
   408  				return
   409  			}
   410  
   411  			for _, ev := range resp.Events {
   412  				var (
   413  					stage Stage
   414  					err   error
   415  				)
   416  				switch ev.Type {
   417  				case mvccpb.PUT:
   418  					stage, err = stageFromJSON(string(ev.Kv.Value))
   419  				case mvccpb.DELETE:
   420  					stage, err = stageFromKey(string(ev.Kv.Key))
   421  					stage.IsDeleted = true
   422  				default:
   423  					// this should not happen.
   424  					log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Type))
   425  					continue
   426  				}
   427  				stage.Revision = ev.Kv.ModRevision
   428  
   429  				if err != nil {
   430  					select {
   431  					case errCh <- err:
   432  					case <-ctx.Done():
   433  						return
   434  					}
   435  				} else {
   436  					select {
   437  					case outCh <- stage:
   438  					case <-ctx.Done():
   439  						return
   440  					}
   441  				}
   442  			}
   443  		}
   444  	}
   445  }
   446  
   447  // putRelayStageOp returns a list of PUT etcd operation for the relay stage.
   448  // k/v: sourceID -> the running stage of the relay.
   449  func putRelayStageOp(stages ...Stage) ([]clientv3.Op, error) {
   450  	ops := make([]clientv3.Op, 0, len(stages))
   451  	for _, stage := range stages {
   452  		value, err := stage.toJSON()
   453  		if err != nil {
   454  			return ops, err
   455  		}
   456  		key := common.StageRelayKeyAdapter.Encode(stage.Source)
   457  		ops = append(ops, clientv3.OpPut(key, value))
   458  	}
   459  	return ops, nil
   460  }
   461  
   462  // putSubTaskStageOp returns a list of PUT etcd operations for the subtask stage.
   463  // k/v: sourceID, task -> the running stage of the subtask.
   464  func putSubTaskStageOp(stages ...Stage) ([]clientv3.Op, error) {
   465  	ops := make([]clientv3.Op, 0, len(stages))
   466  	for _, stage := range stages {
   467  		value, err := stage.toJSON()
   468  		if err != nil {
   469  			return ops, err
   470  		}
   471  		key := common.StageSubTaskKeyAdapter.Encode(stage.Source, stage.Task)
   472  		ops = append(ops, clientv3.OpPut(key, value))
   473  	}
   474  	return ops, nil
   475  }
   476  
   477  // deleteRelayStageOp returns a DELETE etcd operation for the relay stage.
   478  func deleteRelayStageOp(source string) clientv3.Op {
   479  	return clientv3.OpDelete(common.StageRelayKeyAdapter.Encode(source))
   480  }
   481  
   482  // deleteSubTaskStageOp returns a list of DELETE etcd operation for the subtask stage.
   483  func deleteSubTaskStageOp(stages ...Stage) []clientv3.Op {
   484  	ops := make([]clientv3.Op, 0, len(stages))
   485  	for _, stage := range stages {
   486  		ops = append(ops, clientv3.OpDelete(common.StageSubTaskKeyAdapter.Encode(stage.Source, stage.Task)))
   487  	}
   488  	return ops
   489  }