github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/etcdutil/etcdutil.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // learn from https://github.com/pingcap/pd/blob/v3.0.5/pkg/etcdutil/etcdutil.go. 15 16 package etcdutil 17 18 import ( 19 "context" 20 "crypto/tls" 21 "time" 22 23 "github.com/pingcap/errors" 24 "github.com/pingcap/failpoint" 25 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 26 "github.com/pingcap/tiflow/dm/pkg/log" 27 "github.com/pingcap/tiflow/dm/pkg/retry" 28 "github.com/pingcap/tiflow/dm/pkg/terror" 29 "github.com/pingcap/tiflow/pkg/errorutil" 30 v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" 31 clientv3 "go.etcd.io/etcd/client/v3" 32 "go.uber.org/zap" 33 "google.golang.org/grpc/codes" 34 "google.golang.org/grpc/status" 35 ) 36 37 const ( 38 // DefaultDialTimeout is the maximum amount of time a dial will wait for a 39 // connection to setup. 30s is long enough for most of the network conditions. 40 DefaultDialTimeout = 30 * time.Second 41 42 // DefaultRequestTimeout 10s is long enough for most of etcd clusters. 43 DefaultRequestTimeout = 10 * time.Second 44 45 // DefaultRevokeLeaseTimeout is the maximum amount of time waiting for revoke etcd lease. 46 DefaultRevokeLeaseTimeout = 3 * time.Second 47 ) 48 49 var etcdDefaultTxnRetryParam = retry.Params{ 50 RetryCount: 5, 51 FirstRetryDuration: time.Second, 52 BackoffStrategy: retry.Stable, 53 IsRetryableFn: func(retryTime int, err error) bool { 54 return errorutil.IsRetryableEtcdError(err) 55 }, 56 } 57 58 var etcdDefaultTxnStrategy = retry.FiniteRetryStrategy{} 59 60 // CreateClient creates an etcd client with some default config items. 61 func CreateClient(endpoints []string, tlsCfg *tls.Config) (*clientv3.Client, error) { 62 return clientv3.New(clientv3.Config{ 63 Endpoints: endpoints, 64 DialTimeout: DefaultDialTimeout, 65 AutoSyncInterval: 30 * time.Second, 66 TLS: tlsCfg, 67 }) 68 } 69 70 // ListMembers returns a list of internal etcd members. 71 func ListMembers(client *clientv3.Client) (*clientv3.MemberListResponse, error) { 72 ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout) 73 defer cancel() 74 return client.MemberList(ctx) 75 } 76 77 // AddMember adds an etcd member. 78 func AddMember(client *clientv3.Client, peerAddrs []string) (*clientv3.MemberAddResponse, error) { 79 ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout) 80 defer cancel() 81 return client.MemberAdd(ctx, peerAddrs) 82 } 83 84 // RemoveMember removes an etcd member by the given id. 85 func RemoveMember(client *clientv3.Client, id uint64) (*clientv3.MemberRemoveResponse, error) { 86 ctx, cancel := context.WithTimeout(client.Ctx(), DefaultRequestTimeout) 87 defer cancel() 88 return client.MemberRemove(ctx, id) 89 } 90 91 type EtcdOpFunc func(*tcontext.Context, *clientv3.Client) (interface{}, error) 92 93 // DoTxnWithRepeatable do multiple etcd operations in one txn with repeatable retry. 94 // There are two situations that this function can be used: 95 // 1. The operations are all read operations. 96 // 2. The operations are all write operations, but write operations tolerate being written to etcd ** at least once **. 97 // TODO: add unit test to test encountered an retryable error first but then recovered. 98 func DoTxnWithRepeatable(cli *clientv3.Client, opFunc EtcdOpFunc) (*clientv3.TxnResponse, int64, error) { 99 ctx, cancel := context.WithTimeout(cli.Ctx(), DefaultRequestTimeout) 100 defer cancel() 101 tctx := tcontext.NewContext(ctx, log.L()) 102 103 ret, _, err := etcdDefaultTxnStrategy.Apply(tctx, etcdDefaultTxnRetryParam, func(t *tcontext.Context) (interface{}, error) { 104 return opFunc(t, cli) 105 }) 106 if err != nil { 107 return nil, 0, err 108 } 109 if resp, ok := ret.(*clientv3.TxnResponse); ok { 110 return resp, resp.Header.Revision, nil 111 } 112 return nil, 0, nil 113 } 114 115 func ThenOpFunc(ops ...clientv3.Op) EtcdOpFunc { 116 return func(tctx *tcontext.Context, cli *clientv3.Client) (interface{}, error) { 117 resp, err := cli.Txn(tctx.Ctx).Then(ops...).Commit() 118 if err != nil { 119 return nil, terror.ErrHAFailTxnOperation.Delegate(err, "txn commit failed") 120 } 121 return resp, nil 122 } 123 } 124 125 func FullOpFunc(cmps []clientv3.Cmp, opsThen, opsElse []clientv3.Op) EtcdOpFunc { 126 return func(tctx *tcontext.Context, cli *clientv3.Client) (interface{}, error) { 127 failpoint.Inject("ErrNoSpace", func() { 128 tctx.L().Info("fail to do ops in etcd", zap.String("failpoint", "ErrNoSpace")) 129 failpoint.Return(nil, v3rpc.ErrNoSpace) 130 }) 131 resp, err := cli.Txn(tctx.Ctx).If(cmps...).Then(opsThen...).Else(opsElse...).Commit() 132 if err != nil { 133 return nil, terror.ErrHAFailTxnOperation.Delegate(err, "txn commit failed") 134 } 135 return resp, nil 136 } 137 } 138 139 // IsRetryableError returns true if the etcd error is retryable to write ** repeatable **. 140 // https://github.com/etcd-io/etcd/blob/v3.5.2/client/v3/retry.go#L53 141 func IsRetryableError(err error) bool { 142 err = errors.Cause(err) 143 switch err { 144 case v3rpc.ErrCompacted, v3rpc.ErrNoLeader, v3rpc.ErrNoSpace, context.DeadlineExceeded: 145 return true 146 } 147 eErr := v3rpc.Error(err) 148 if serverErr, ok := eErr.(v3rpc.EtcdError); ok && serverErr.Code() != codes.Unavailable { 149 return false 150 } 151 // only retry if unavailable 152 return status.Code(err) == codes.Unavailable 153 } 154 155 // IsLimitedRetryableError check whether error is retryable error for etcd to build again in a limited number of times. 156 func IsLimitedRetryableError(err error) bool { 157 switch errors.Cause(err) { 158 case v3rpc.ErrNoSpace, context.DeadlineExceeded: 159 return true 160 default: 161 return false 162 } 163 }