github.com/pingcap/chaos@v0.0.0-20190710112158-c86faf4b3719/db/cluster/cluster.go

package cluster

import (
	"context"
	"fmt"
	"log"
	"path"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/chaos/pkg/util"
	"github.com/pingcap/chaos/pkg/util/ssh"
)

const (
	archiveURL = "http://download.pingcap.org/tidb-latest-linux-amd64.tar.gz"
	deployDir  = "/opt/tidb"

	waitPDCount = 10
)

var (
	pdBinary   = path.Join(deployDir, "./bin/pd-server")
	tikvBinary = path.Join(deployDir, "./bin/tikv-server")
	tidbBinary = path.Join(deployDir, "./bin/tidb-server")

	pdConfig   = path.Join(deployDir, "./conf/pd.toml")
	tikvConfig = path.Join(deployDir, "./conf/tikv.toml")
	tidbConfig = path.Join(deployDir, "./conf/tidb.toml")

	pdLog   = path.Join(deployDir, "./log/pd.log")
	tikvLog = path.Join(deployDir, "./log/tikv.log")
	tidbLog = path.Join(deployDir, "./log/tidb.log")
)

// Cluster is the TiKV/TiDB database cluster.
// Note: Cluster does not implement the `core.DB` interface.
type Cluster struct {
	once           sync.Once
	nodes          []string
	installBlocker util.BlockRunner
	IncludeTidb    bool
}

// SetUp initializes the database.
func (cluster *Cluster) SetUp(ctx context.Context, nodes []string, node string) error {
	// Try to kill all old servers first.
	if cluster.IncludeTidb {
		ssh.Exec(ctx, node, "killall", "-9", "tidb-server")
	}
	ssh.Exec(ctx, node, "killall", "-9", "tikv-server")
	ssh.Exec(ctx, node, "killall", "-9", "pd-server")

	cluster.once.Do(func() {
		cluster.nodes = nodes
		cluster.installBlocker.Init(len(nodes))
	})

	log.Printf("install archive on node %s", node)

	var err error
	cluster.installBlocker.Run(func() {
		if !util.IsFileExist(ctx, node, deployDir) {
			err = util.InstallArchive(ctx, node, archiveURL, deployDir)
		}
	})
	if err != nil {
		return err
	}

	util.Mkdir(ctx, node, path.Join(deployDir, "conf"))
	util.Mkdir(ctx, node, path.Join(deployDir, "log"))

	pdCfs := []string{
		"tick-interval=\"100ms\"",
		"election-interval=\"500ms\"",
		"tso-save-interval=\"500ms\"",
		"[replication]",
		"max-replicas=3",
	}

	if err := util.WriteFile(ctx, node, pdConfig, strconv.Quote(strings.Join(pdCfs, "\n"))); err != nil {
		return err
	}

	tikvCfs := []string{
		"[server]",
		"status-addr=\"0.0.0.0:20180\"",
		"[raftstore]",
		"capacity =\"100G\"",
		"pd-heartbeat-tick-interval=\"3s\"",
		"raft_store_max_leader_lease=\"50ms\"",
		"raft_base_tick_interval=\"100ms\"",
		"raft_heartbeat_ticks=3",
		"raft_election_timeout_ticks=10",
		"sync-log = true",
		"[coprocessor]",
		"region-max-keys = 5",
		"region-split-keys = 2",
	}

	if err := util.WriteFile(ctx, node, tikvConfig, strconv.Quote(strings.Join(tikvCfs, "\n"))); err != nil {
		return err
	}

	tidbCfs := []string{
		"lease = \"1s\"",
		"split-table = true",
		"[tikv-client]",
		"commit-timeout = \"10ms\"",
		"max-txn-time-use = 590",
	}

	if err := util.WriteFile(ctx, node, tidbConfig, strconv.Quote(strings.Join(tidbCfs, "\n"))); err != nil {
		return err
	}

	return cluster.start(ctx, node, true)
}
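
// The three config files above are written as a single quoted string: the
// lines are joined with "\n" and wrapped in strconv.Quote, which escapes the
// newlines and inner double quotes so the payload survives the remote shell
// that util.WriteFile presumably passes it through. A minimal sketch of what
// the PD payload looks like (exampleQuotedConfig is a hypothetical helper,
// not used elsewhere in this file):
func exampleQuotedConfig() string {
	cfs := []string{
		"tick-interval=\"100ms\"",
		"[replication]",
		"max-replicas=3",
	}
	// Produces the single escaped line:
	// "tick-interval=\"100ms\"\n[replication]\nmax-replicas=3"
	return strconv.Quote(strings.Join(cfs, "\n"))
}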
// TearDown tears down the database.
func (cluster *Cluster) TearDown(ctx context.Context, nodes []string, node string) error {
	return cluster.Kill(ctx, node)
}

// Start starts the database.
func (cluster *Cluster) Start(ctx context.Context, node string) error {
	return cluster.start(ctx, node, false)
}

func (cluster *Cluster) start(ctx context.Context, node string, inSetUp bool) error {
	log.Printf("start database on node %s", node)

	initialClusterArgs := make([]string, len(cluster.nodes))
	for i, n := range cluster.nodes {
		initialClusterArgs[i] = fmt.Sprintf("%s=http://%s:2380", n, n)
	}
	pdArgs := []string{
		fmt.Sprintf("--name=%s", node),
		"--data-dir=pd",
		"--client-urls=http://0.0.0.0:2379",
		"--peer-urls=http://0.0.0.0:2380",
		fmt.Sprintf("--advertise-client-urls=http://%s:2379", node),
		fmt.Sprintf("--advertise-peer-urls=http://%s:2380", node),
		fmt.Sprintf("--initial-cluster=%s", strings.Join(initialClusterArgs, ",")),
		fmt.Sprintf("--log-file=%s", pdLog),
		fmt.Sprintf("--config=%s", pdConfig),
	}

	log.Printf("start pd-server on node %s", node)
	pdPID := path.Join(deployDir, "pd.pid")
	opts := util.NewDaemonOptions(deployDir, pdPID)
	if err := util.StartDaemon(ctx, node, opts, pdBinary, pdArgs...); err != nil {
		return err
	}

	if inSetUp {
		time.Sleep(5 * time.Second)
	}

	if !util.IsDaemonRunning(ctx, node, pdBinary, pdPID) {
		return fmt.Errorf("failed to start pd on node %s", node)
	}

	pdEndpoints := make([]string, len(cluster.nodes))
	for i, n := range cluster.nodes {
		pdEndpoints[i] = fmt.Sprintf("%s:2379", n)
	}

	// Before starting TiKV, we should ensure the PD cluster is ready.
WAIT:
	for i := 0; i < waitPDCount; i++ {
		for _, ep := range pdEndpoints {
			// The member API works once the PD cluster is ready.
			memberAPI := fmt.Sprintf("%s/pd/api/v1/members", ep)
			// `--fail`: curl exits non-zero on server errors.
			_, err := ssh.CombinedOutput(ctx, node, "curl", "--fail", memberAPI)
			if err == nil {
				log.Println("PD cluster is ready")
				break WAIT
			}
		}
		log.Println("waiting for PD cluster...")
		time.Sleep(1 * time.Second)
	}
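	// The probe above amounts to running, via ssh on the node:
	//
	//	curl --fail <pd-host>:2379/pd/api/v1/members
	//
	// which succeeds only once some PD endpoint answers its member API with
	// a 2xx status. After waitPDCount failed rounds the loop simply gives up
	// and TiKV is started anyway.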
	tikvArgs := []string{
		fmt.Sprintf("--pd=%s", strings.Join(pdEndpoints, ",")),
		"--addr=0.0.0.0:20160",
		fmt.Sprintf("--advertise-addr=%s:20160", node),
		"--data-dir=tikv",
		fmt.Sprintf("--log-file=%s", tikvLog),
		fmt.Sprintf("--config=%s", tikvConfig),
	}

	log.Printf("start tikv-server on node %s", node)
	tikvPID := path.Join(deployDir, "tikv.pid")
	opts = util.NewDaemonOptions(deployDir, tikvPID)
	if err := util.StartDaemon(ctx, node, opts, tikvBinary, tikvArgs...); err != nil {
		return err
	}

	if !util.IsDaemonRunning(ctx, node, tikvBinary, tikvPID) {
		return fmt.Errorf("failed to start tikv on node %s", node)
	}

	if cluster.IncludeTidb {
		tidbArgs := []string{
			"--store=tikv",
			fmt.Sprintf("--path=%s", strings.Join(pdEndpoints, ",")),
			fmt.Sprintf("--log-file=%s", tidbLog),
			fmt.Sprintf("--config=%s", tidbConfig),
		}

		log.Printf("start tidb-server on node %s", node)
		tidbPID := path.Join(deployDir, "tidb.pid")
		opts = util.NewDaemonOptions(deployDir, tidbPID)
		if err := util.StartDaemon(ctx, node, opts, tidbBinary, tidbArgs...); err != nil {
			return err
		}

		var err error
		if inSetUp {
			for i := 0; i < 12; i++ {
				if err = ssh.Exec(ctx, node, "curl", fmt.Sprintf("http://%s:10080/status", node)); err == nil {
					break
				}
				log.Printf("waiting for tidb to be ready on %s", node)
				time.Sleep(10 * time.Second)
			}
		}

		if err != nil {
			return err
		}

		if !util.IsDaemonRunning(ctx, node, tidbBinary, tidbPID) {
			return fmt.Errorf("failed to start tidb on node %s", node)
		}
	}

	return nil
}

// Stop stops the database.
func (cluster *Cluster) Stop(ctx context.Context, node string) error {
	if cluster.IncludeTidb {
		if err := util.StopDaemon(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid")); err != nil {
			return err
		}
	}

	if err := util.StopDaemon(ctx, node, tikvBinary, path.Join(deployDir, "tikv.pid")); err != nil {
		return err
	}

	return util.StopDaemon(ctx, node, pdBinary, path.Join(deployDir, "pd.pid"))
}

// Kill kills the database.
func (cluster *Cluster) Kill(ctx context.Context, node string) error {
	if cluster.IncludeTidb {
		if err := util.KillDaemon(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid")); err != nil {
			return err
		}
	}

	if err := util.KillDaemon(ctx, node, tikvBinary, path.Join(deployDir, "tikv.pid")); err != nil {
		return err
	}

	return util.KillDaemon(ctx, node, pdBinary, path.Join(deployDir, "pd.pid"))
}

// IsRunning checks whether the database is running or not.
func (cluster *Cluster) IsRunning(ctx context.Context, node string) bool {
	if cluster.IncludeTidb {
		return util.IsDaemonRunning(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid"))
	}
	// Without TiDB, TiKV is the topmost layer, so check its daemon instead.
	return util.IsDaemonRunning(ctx, node, tikvBinary, path.Join(deployDir, "tikv.pid"))
}

// Name returns the unique name for the database.
func (cluster *Cluster) Name() string {
	return "cluster"
}
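
// A minimal usage sketch (exampleRun is a hypothetical driver, not part of
// this package's API): SetUp should run concurrently, one goroutine per node,
// since installBlocker is initialized with len(nodes) and, assuming
// util.BlockRunner acts as a barrier, each Run call blocks until all nodes
// have reached it. The node addresses below are placeholders.
func exampleRun(ctx context.Context) error {
	nodes := []string{"n1", "n2", "n3"}
	cluster := &Cluster{IncludeTidb: true}

	var wg sync.WaitGroup
	errs := make([]error, len(nodes))
	for i, node := range nodes {
		wg.Add(1)
		go func(i int, node string) {
			defer wg.Done()
			errs[i] = cluster.SetUp(ctx, nodes, node)
		}(i, node)
	}
	wg.Wait()
	for _, err := range errs {
		if err != nil {
			return err
		}
	}

	// Verify every node reports its daemons as running before injecting faults.
	for _, node := range nodes {
		if !cluster.IsRunning(ctx, node) {
			return fmt.Errorf("cluster is not running on node %s", node)
		}
	}
	return nil
}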