github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/etcd.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package master 15 16 import ( 17 "fmt" 18 "net/http" 19 "os" 20 "path/filepath" 21 "strings" 22 "time" 23 24 toolutils "github.com/pingcap/tidb-tools/pkg/utils" 25 "github.com/pingcap/tiflow/dm/pkg/etcdutil" 26 "github.com/pingcap/tiflow/dm/pkg/log" 27 "github.com/pingcap/tiflow/dm/pkg/terror" 28 "go.etcd.io/etcd/server/v3/embed" 29 "go.uber.org/zap" 30 "google.golang.org/grpc" 31 ) 32 33 const ( 34 // time waiting for etcd to be started. 35 etcdStartTimeout = time.Minute 36 // privateDirMode grants owner to make/remove files inside the directory. 37 privateDirMode os.FileMode = 0o700 38 ) 39 40 // startEtcd starts an embedded etcd server. 41 func startEtcd(etcdCfg *embed.Config, 42 gRPCSvr func(*grpc.Server), 43 httpHandles map[string]http.Handler, startTimeout time.Duration, 44 ) (*embed.Etcd, error) { 45 // attach extra gRPC and HTTP server 46 if gRPCSvr != nil { 47 etcdCfg.ServiceRegister = gRPCSvr 48 } 49 if httpHandles != nil { 50 etcdCfg.UserHandlers = httpHandles 51 } 52 53 e, err := embed.StartEtcd(etcdCfg) 54 if err != nil { 55 return nil, terror.ErrMasterStartEmbedEtcdFail.Delegate(err) 56 } 57 58 select { 59 case <-e.Server.ReadyNotify(): 60 case <-time.After(startTimeout): 61 // if fail to startup, the etcd server may be still blocking in 62 // https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/serve.go#L92 63 // then `e.Close` will block in 64 // https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/etcd.go#L377 65 // because `close(sctx.serversC)` has not been called in 66 // https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/serve.go#L200. 67 // so for `ReadyNotify` timeout, we choose to only call `e.Server.Stop()` now, 68 // and we should exit the DM-master process after returned with error from this function. 69 e.Server.Stop() 70 return nil, terror.ErrMasterStartEmbedEtcdFail.Generatef("start embed etcd timeout %v", startTimeout) 71 } 72 return e, nil 73 } 74 75 // prepareJoinEtcd prepares config needed to join an existing cluster. 76 // learn from https://github.com/pingcap/pd/blob/37efcb05f397f26c70cda8dd44acaa3061c92159/server/join/join.go#L44. 77 // 78 // when setting `initial-cluster` explicitly to bootstrap a new cluster: 79 // - if local persistent data exist, just restart the previous cluster (in fact, it's not bootstrapping). 80 // - if local persistent data not exist, just bootstrap the cluster as a new cluster. 81 // 82 // when setting `join` to join an existing cluster (without `initial-cluster` set): 83 // - if local persistent data exists (in fact, it's not join): 84 // - just restart if `member` already exists (already joined before) 85 // - read `initial-cluster` back from local persistent data to restart (just like bootstrapping) 86 // 87 // - if local persistent data not exist: 88 // 1. fetch member list from the cluster to check if we can join now. 89 // 2. call `member add` to add the member info into the cluster. 90 // 3. generate config for join (`initial-cluster` and `initial-cluster-state`). 91 // 4. save `initial-cluster` in local persistent data for later restarting. 92 // 93 // NOTE: A member can't join to another cluster after it has joined a previous one. 94 func prepareJoinEtcd(cfg *Config) error { 95 // no need to join 96 if cfg.Join == "" { 97 return nil 98 } 99 100 // try to join self, invalid 101 if cfg.Join == cfg.AdvertiseAddr { 102 return terror.ErrMasterJoinEmbedEtcdFail.Generate(fmt.Sprintf("join self %s is forbidden", cfg.Join)) 103 } 104 105 // restart with previous data, no `InitialCluster` need to set 106 // ref: https://github.com/etcd-io/etcd/blob/ae9734ed278b7a1a7dfc82e800471ebbf9fce56f/etcdserver/server.go#L313 107 if isDirExist(filepath.Join(cfg.DataDir, "member", "wal")) { 108 cfg.InitialCluster = "" 109 cfg.InitialClusterState = embed.ClusterStateFlagExisting 110 return nil 111 } 112 113 // join with persistent data 114 joinFP := filepath.Join(cfg.DataDir, "join") 115 if s, err := os.ReadFile(joinFP); err != nil { 116 if !os.IsNotExist(err) { 117 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, "read persistent join data") 118 } 119 } else { 120 cfg.InitialCluster = strings.TrimSpace(string(s)) 121 cfg.InitialClusterState = embed.ClusterStateFlagExisting 122 log.L().Info("using persistent join data", zap.String("file", joinFP), zap.String("data", cfg.InitialCluster)) 123 return nil 124 } 125 126 tlsCfg, err := toolutils.ToTLSConfig(cfg.SSLCA, cfg.SSLCert, cfg.SSLKey) 127 if err != nil { 128 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, "generate tls config") 129 } 130 131 // if without previous data, we need a client to contact with the existing cluster. 132 client, err := etcdutil.CreateClient(strings.Split(cfg.Join, ","), tlsCfg) 133 if err != nil { 134 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, fmt.Sprintf("create etcd client for %s", cfg.Join)) 135 } 136 defer client.Close() 137 138 // `member list` 139 listResp, err := etcdutil.ListMembers(client) 140 if err != nil { 141 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, fmt.Sprintf("list member for %s", cfg.Join)) 142 } 143 144 // check members 145 for _, m := range listResp.Members { 146 if m.Name == "" { // the previous existing member without name (not complete the join operation) 147 // we can't generate `initial-cluster` correctly with empty member name, 148 // and if added a member but not started it to complete the join, 149 // the later join operation may encounter `etcdserver: re-configuration failed due to not enough started members`. 150 return terror.ErrMasterJoinEmbedEtcdFail.Generate("there is a member that has not joined successfully, continue the join or remove it") 151 } 152 if m.Name == cfg.Name { 153 // a failed DM-master re-joins the previous cluster. 154 return terror.ErrMasterJoinEmbedEtcdFail.Generate(fmt.Sprintf("missing data or joining a duplicate member %s", m.Name)) 155 } 156 } 157 158 // `member add`, a new/deleted DM-master joins to an existing cluster. 159 addResp, err := etcdutil.AddMember(client, strings.Split(cfg.AdvertisePeerUrls, ",")) 160 if err != nil { 161 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, fmt.Sprintf("add member %s", cfg.AdvertisePeerUrls)) 162 } 163 164 // generate `--initial-cluster` 165 ms := make([]string, 0, len(addResp.Members)) 166 for _, m := range addResp.Members { 167 name := m.Name 168 if m.ID == addResp.Member.ID { 169 // the member only called `member add`, 170 // but has not started the process to complete the join should have an empty name. 171 // so, we use the `name` in config instead. 172 name = cfg.Name 173 } 174 if name == "" { 175 // this should be checked in the previous `member list` operation if having only one member is join. 176 // if multi join operations exist, the behavior may be unexpected. 177 // check again here only to decrease the unexpectedness. 178 return terror.ErrMasterJoinEmbedEtcdFail.Generate("there is a member that has not joined successfully, continue the join or remove it") 179 } 180 for _, url := range m.PeerURLs { 181 ms = append(ms, fmt.Sprintf("%s=%s", name, url)) 182 } 183 } 184 cfg.InitialCluster = strings.Join(ms, ",") 185 cfg.InitialClusterState = embed.ClusterStateFlagExisting 186 187 // save `--initial-cluster` in persist data 188 if err = os.MkdirAll(cfg.DataDir, privateDirMode); err != nil && !os.IsExist(err) { 189 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, "make directory") 190 } 191 if err = os.WriteFile(joinFP, []byte(cfg.InitialCluster), privateDirMode); err != nil { 192 return terror.ErrMasterJoinEmbedEtcdFail.Delegate(err, "write persistent join data") 193 } 194 195 return nil 196 } 197 198 // isDirExist returns whether the directory is exist. 199 func isDirExist(d string) bool { 200 if stat, err := os.Stat(d); err == nil && stat.IsDir() { 201 return true 202 } 203 return false 204 }