github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/etcdutil/etcdutil.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  // learn from https://github.com/pingcap/pd/blob/v3.0.5/pkg/etcdutil/etcdutil.go.
    15  
    16  package etcdutil
    17  
    18  import (
    19  	"context"
    20  	"crypto/tls"
    21  	"time"
    22  
    23  	"github.com/pingcap/errors"
    24  	"github.com/pingcap/failpoint"
    25  	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
    26  	"github.com/pingcap/tiflow/dm/pkg/log"
    27  	"github.com/pingcap/tiflow/dm/pkg/retry"
    28  	"github.com/pingcap/tiflow/dm/pkg/terror"
    29  	"github.com/pingcap/tiflow/pkg/errorutil"
    30  	v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
    31  	clientv3 "go.etcd.io/etcd/client/v3"
    32  	"go.uber.org/zap"
    33  	"google.golang.org/grpc/codes"
    34  	"google.golang.org/grpc/status"
    35  )
    36  
    37  const (
    38  	// DefaultDialTimeout is the maximum amount of time a dial will wait for a
    39  	// connection to setup. 30s is long enough for most of the network conditions.
    40  	DefaultDialTimeout = 30 * time.Second
    41  
    42  	// DefaultRequestTimeout 10s is long enough for most of etcd clusters.
    43  	DefaultRequestTimeout = 10 * time.Second
    44  
    45  	// DefaultRevokeLeaseTimeout is the maximum amount of time waiting for revoke etcd lease.
    46  	DefaultRevokeLeaseTimeout = 3 * time.Second
    47  )
    48  
    49  var etcdDefaultTxnRetryParam = retry.Params{
    50  	RetryCount:         5,
    51  	FirstRetryDuration: time.Second,
    52  	BackoffStrategy:    retry.Stable,
    53  	IsRetryableFn: func(retryTime int, err error) bool {
    54  		return errorutil.IsRetryableEtcdError(err)
    55  	},
    56  }
    57  
    58  var etcdDefaultTxnStrategy = retry.FiniteRetryStrategy{}
    59  
    60  // CreateClient creates an etcd client with some default config items.
    61  func CreateClient(endpoints []string, tlsCfg *tls.Config) (*clientv3.Client, error) {
    62  	return clientv3.New(clientv3.Config{
    63  		Endpoints:        endpoints,
    64  		DialTimeout:      DefaultDialTimeout,
    65  		AutoSyncInterval: 30 * time.Second,
    66  		TLS:              tlsCfg,
    67  	})
    68  }
    69  
    70  // ListMembers returns a list of internal etcd members.
    71  func ListMembers(client *clientv3.Client) (*clientv3.MemberListResponse, error) {
    72  	ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout)
    73  	defer cancel()
    74  	return client.MemberList(ctx)
    75  }
    76  
    77  // AddMember adds an etcd member.
    78  func AddMember(client *clientv3.Client, peerAddrs []string) (*clientv3.MemberAddResponse, error) {
    79  	ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout)
    80  	defer cancel()
    81  	return client.MemberAdd(ctx, peerAddrs)
    82  }
    83  
    84  // RemoveMember removes an etcd member by the given id.
    85  func RemoveMember(client *clientv3.Client, id uint64) (*clientv3.MemberRemoveResponse, error) {
    86  	ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout)
    87  	defer cancel()
    88  	return client.MemberRemove(ctx, id)
    89  }
    90  
    91  type EtcdOpFunc func(*tcontext.Context, *clientv3.Client) (interface{}, error)
    92  
    93  // DoTxnWithRepeatable do multiple etcd operations in one txn with repeatable retry.
    94  // There are two situations that this function can be used:
    95  // 1. The operations are all read operations.
    96  // 2. The operations are all write operations, but write operations tolerate being written to etcd ** at least once **.
    97  // TODO: add unit test to test encountered an retryable error first but then recovered.
    98  func DoTxnWithRepeatable(cli *clientv3.Client, opFunc EtcdOpFunc) (*clientv3.TxnResponse, int64, error) {
    99  	ctx, cancel := context.WithTimeout(cli.Ctx(), DefaultRequestTimeout)
   100  	defer cancel()
   101  	tctx := tcontext.NewContext(ctx, log.L())
   102  
   103  	ret, _, err := etcdDefaultTxnStrategy.Apply(tctx, etcdDefaultTxnRetryParam, func(t *tcontext.Context) (interface{}, error) {
   104  		return opFunc(t, cli)
   105  	})
   106  	if err != nil {
   107  		return nil, 0, err
   108  	}
   109  	if resp, ok := ret.(*clientv3.TxnResponse); ok {
   110  		return resp, resp.Header.Revision, nil
   111  	}
   112  	return nil, 0, nil
   113  }
   114  
   115  func ThenOpFunc(ops ...clientv3.Op) EtcdOpFunc {
   116  	return func(tctx *tcontext.Context, cli *clientv3.Client) (interface{}, error) {
   117  		resp, err := cli.Txn(tctx.Ctx).Then(ops...).Commit()
   118  		if err != nil {
   119  			return nil, terror.ErrHAFailTxnOperation.Delegate(err, "txn commit failed")
   120  		}
   121  		return resp, nil
   122  	}
   123  }
   124  
   125  func FullOpFunc(cmps []clientv3.Cmp, opsThen, opsElse []clientv3.Op) EtcdOpFunc {
   126  	return func(tctx *tcontext.Context, cli *clientv3.Client) (interface{}, error) {
   127  		failpoint.Inject("ErrNoSpace", func() {
   128  			tctx.L().Info("fail to do ops in etcd", zap.String("failpoint", "ErrNoSpace"))
   129  			failpoint.Return(nil, v3rpc.ErrNoSpace)
   130  		})
   131  		resp, err := cli.Txn(tctx.Ctx).If(cmps...).Then(opsThen...).Else(opsElse...).Commit()
   132  		if err != nil {
   133  			return nil, terror.ErrHAFailTxnOperation.Delegate(err, "txn commit failed")
   134  		}
   135  		return resp, nil
   136  	}
   137  }
   138  
   139  // IsRetryableError returns true if the etcd error is retryable to write ** repeatable **.
   140  // https://github.com/etcd-io/etcd/blob/v3.5.2/client/v3/retry.go#L53
   141  func IsRetryableError(err error) bool {
   142  	err = errors.Cause(err)
   143  	switch err {
   144  	case v3rpc.ErrCompacted, v3rpc.ErrNoLeader, v3rpc.ErrNoSpace, context.DeadlineExceeded:
   145  		return true
   146  	}
   147  	eErr := v3rpc.Error(err)
   148  	if serverErr, ok := eErr.(v3rpc.EtcdError); ok && serverErr.Code() != codes.Unavailable {
   149  		return false
   150  	}
   151  	// only retry if unavailable
   152  	return status.Code(err) == codes.Unavailable
   153  }
   154  
   155  // IsLimitedRetryableError check whether error is retryable error for etcd to build again in a limited number of times.
   156  func IsLimitedRetryableError(err error) bool {
   157  	switch errors.Cause(err) {
   158  	case v3rpc.ErrNoSpace, context.DeadlineExceeded:
   159  		return true
   160  	default:
   161  		return false
   162  	}
   163  }