github.com/pingcap/chaos@v0.0.0-20190710112158-c86faf4b3719/db/cluster/cluster.go (about)

     1  package cluster
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"path"
     8  	"strconv"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/pingcap/chaos/pkg/util"
    14  	"github.com/pingcap/chaos/pkg/util/ssh"
    15  )
    16  
    17  const (
    18  	archiveURL = "http://download.pingcap.org/tidb-latest-linux-amd64.tar.gz"
    19  	deployDir  = "/opt/tidb"
    20  
    21  	waitPDCount = 10
    22  )
    23  
    24  var (
    25  	pdBinary   = path.Join(deployDir, "./bin/pd-server")
    26  	tikvBinary = path.Join(deployDir, "./bin/tikv-server")
    27  	tidbBinary = path.Join(deployDir, "./bin/tidb-server")
    28  
    29  	pdConfig   = path.Join(deployDir, "./conf/pd.toml")
    30  	tikvConfig = path.Join(deployDir, "./conf/tikv.toml")
    31  	tidbConfig = path.Join(deployDir, "./conf/tidb.toml")
    32  
    33  	pdLog   = path.Join(deployDir, "./log/pd.log")
    34  	tikvLog = path.Join(deployDir, "./log/tikv.log")
    35  	tidbLog = path.Join(deployDir, "./log/tidb.log")
    36  )
    37  
// Cluster is the TiKV/TiDB database cluster.
// Note: Cluster does not implement `core.DB` interface.
//
// Each node runs pd-server and tikv-server; tidb-server is additionally
// managed when IncludeTidb is set. The node list and the install blocker are
// initialized by the first SetUp call only and reused afterwards.
type Cluster struct {
	once           sync.Once        // guards the one-time initialization done in SetUp
	nodes          []string         // node list captured by the first SetUp call
	installBlocker util.BlockRunner // runs the archive installation step in SetUp
	IncludeTidb    bool             // when true, tidb-server is managed alongside PD and TiKV
}
    46  
    47  // SetUp initializes the database.
    48  func (cluster *Cluster) SetUp(ctx context.Context, nodes []string, node string) error {
    49  	// Try kill all old servers
    50  	if cluster.IncludeTidb {
    51  		ssh.Exec(ctx, node, "killall", "-9", "tidb-server")
    52  	}
    53  	ssh.Exec(ctx, node, "killall", "-9", "tikv-server")
    54  	ssh.Exec(ctx, node, "killall", "-9", "pd-server")
    55  
    56  	cluster.once.Do(func() {
    57  		cluster.nodes = nodes
    58  		cluster.installBlocker.Init(len(nodes))
    59  	})
    60  
    61  	log.Printf("install archieve on node %s", node)
    62  
    63  	var err error
    64  	cluster.installBlocker.Run(func() {
    65  		if !util.IsFileExist(ctx, node, deployDir) {
    66  			err = util.InstallArchive(ctx, node, archiveURL, deployDir)
    67  		}
    68  	})
    69  	if err != nil {
    70  		return err
    71  	}
    72  
    73  	util.Mkdir(ctx, node, path.Join(deployDir, "conf"))
    74  	util.Mkdir(ctx, node, path.Join(deployDir, "log"))
    75  
    76  	pdCfs := []string{
    77  		"tick-interval=\"100ms\"",
    78  		"election-interval=\"500ms\"",
    79  		"tso-save-interval=\"500ms\"",
    80  		"[replication]",
    81  		"max-replicas=3",
    82  	}
    83  
    84  	if err := util.WriteFile(ctx, node, pdConfig, strconv.Quote(strings.Join(pdCfs, "\n"))); err != nil {
    85  		return err
    86  	}
    87  
    88  	tikvCfs := []string{
    89  		"[server]",
    90  		"status-addr=\"0.0.0.0:20180\"",
    91  		"[raftstore]",
    92  		"capacity =\"100G\"",
    93  		"pd-heartbeat-tick-interval=\"3s\"",
    94  		"raft_store_max_leader_lease=\"50ms\"",
    95  		"raft_base_tick_interval=\"100ms\"",
    96  		"raft_heartbeat_ticks=3",
    97  		"raft_election_timeout_ticks=10",
    98  		"sync-log = true",
    99  		"[coprocessor]",
   100  		"region-max-keys = 5",
   101  		"region-split-keys = 2",
   102  	}
   103  
   104  	if err := util.WriteFile(ctx, node, tikvConfig, strconv.Quote(strings.Join(tikvCfs, "\n"))); err != nil {
   105  		return err
   106  	}
   107  
   108  	tidbCfs := []string{
   109  		"lease = \"1s\"",
   110  		"split-table = true",
   111  		"[tikv-client]",
   112  		"commit-timeout = \"10ms\"",
   113  		"max-txn-time-use = 590",
   114  	}
   115  
   116  	if err := util.WriteFile(ctx, node, tidbConfig, strconv.Quote(strings.Join(tidbCfs, "\n"))); err != nil {
   117  		return err
   118  	}
   119  
   120  	return cluster.start(ctx, node, true)
   121  }
   122  
   123  // TearDown tears down the database.
   124  func (cluster *Cluster) TearDown(ctx context.Context, nodes []string, node string) error {
   125  	return cluster.Kill(ctx, node)
   126  }
   127  
   128  // Start starts the database
   129  func (cluster *Cluster) Start(ctx context.Context, node string) error {
   130  	return cluster.start(ctx, node, false)
   131  }
   132  
   133  func (cluster *Cluster) start(ctx context.Context, node string, inSetUp bool) error {
   134  	log.Printf("start database on node %s", node)
   135  
   136  	initialClusterArgs := make([]string, len(cluster.nodes))
   137  	for i, n := range cluster.nodes {
   138  		initialClusterArgs[i] = fmt.Sprintf("%s=http://%s:2380", n, n)
   139  	}
   140  	pdArgs := []string{
   141  		fmt.Sprintf("--name=%s", node),
   142  		"--data-dir=pd",
   143  		"--client-urls=http://0.0.0.0:2379",
   144  		"--peer-urls=http://0.0.0.0:2380",
   145  		fmt.Sprintf("--advertise-client-urls=http://%s:2379", node),
   146  		fmt.Sprintf("--advertise-peer-urls=http://%s:2380", node),
   147  		fmt.Sprintf("--initial-cluster=%s", strings.Join(initialClusterArgs, ",")),
   148  		fmt.Sprintf("--log-file=%s", pdLog),
   149  		fmt.Sprintf("--config=%s", pdConfig),
   150  	}
   151  
   152  	log.Printf("start pd-server on node %s", node)
   153  	pdPID := path.Join(deployDir, "pd.pid")
   154  	opts := util.NewDaemonOptions(deployDir, pdPID)
   155  	if err := util.StartDaemon(ctx, node, opts, pdBinary, pdArgs...); err != nil {
   156  		return err
   157  	}
   158  
   159  	if inSetUp {
   160  		time.Sleep(5 * time.Second)
   161  	}
   162  
   163  	if !util.IsDaemonRunning(ctx, node, pdBinary, pdPID) {
   164  		return fmt.Errorf("fail to start pd on node %s", node)
   165  	}
   166  
   167  	pdEndpoints := make([]string, len(cluster.nodes))
   168  	for i, n := range cluster.nodes {
   169  		pdEndpoints[i] = fmt.Sprintf("%s:2379", n)
   170  	}
   171  
   172  	// Before starting TiKV, we should ensure PD cluster is ready.
   173  WAIT:
   174  	for i := 0; i < waitPDCount; i++ {
   175  		for _, ep := range pdEndpoints {
   176  			// Member API works when PD cluster is ready.
   177  			memberAPI := fmt.Sprintf("%s/pd/api/v1/members", ep)
   178  			// `--fail`, non-zero exit code on server errors.
   179  			_, err := ssh.CombinedOutput(ctx, node, "curl", "--fail", memberAPI)
   180  			if err == nil {
   181  				log.Println("PD cluster is ready")
   182  				break WAIT
   183  			}
   184  		}
   185  		log.Println("waiting PD cluster...")
   186  		time.Sleep(1 * time.Second)
   187  	}
   188  
   189  	tikvArgs := []string{
   190  		fmt.Sprintf("--pd=%s", strings.Join(pdEndpoints, ",")),
   191  		"--addr=0.0.0.0:20160",
   192  		fmt.Sprintf("--advertise-addr=%s:20160", node),
   193  		"--data-dir=tikv",
   194  		fmt.Sprintf("--log-file=%s", tikvLog),
   195  		fmt.Sprintf("--config=%s", tikvConfig),
   196  	}
   197  
   198  	log.Printf("start tikv-server on node %s", node)
   199  	tikvPID := path.Join(deployDir, "tikv.pid")
   200  	opts = util.NewDaemonOptions(deployDir, tikvPID)
   201  	if err := util.StartDaemon(ctx, node, opts, tikvBinary, tikvArgs...); err != nil {
   202  		return err
   203  	}
   204  
   205  	if !util.IsDaemonRunning(ctx, node, tikvBinary, tikvPID) {
   206  		return fmt.Errorf("fail to start tikv on node %s", node)
   207  	}
   208  
   209  	if cluster.IncludeTidb {
   210  		tidbArgs := []string{
   211  			"--store=tikv",
   212  			fmt.Sprintf("--path=%s", strings.Join(pdEndpoints, ",")),
   213  			fmt.Sprintf("--log-file=%s", tidbLog),
   214  			fmt.Sprintf("--config=%s", tidbConfig),
   215  		}
   216  
   217  		log.Printf("start tidb-server on node %s", node)
   218  		tidbPID := path.Join(deployDir, "tidb.pid")
   219  		opts = util.NewDaemonOptions(deployDir, tidbPID)
   220  		if err := util.StartDaemon(ctx, node, opts, tidbBinary, tidbArgs...); err != nil {
   221  			return err
   222  		}
   223  
   224  		var err error
   225  		if inSetUp {
   226  			for i := 0; i < 12; i++ {
   227  				if err = ssh.Exec(ctx, node, "curl", fmt.Sprintf("http://%s:10080/status", node)); err == nil {
   228  					break
   229  				}
   230  				log.Printf("try to wait tidb run on %s", node)
   231  				time.Sleep(10 * time.Second)
   232  			}
   233  		}
   234  
   235  		if err != nil {
   236  			return err
   237  		}
   238  
   239  		if !util.IsDaemonRunning(ctx, node, tidbBinary, tidbPID) {
   240  			return fmt.Errorf("fail to start tidb on node %s", node)
   241  		}
   242  	}
   243  
   244  	return nil
   245  }
   246  
   247  // Stop stops the database
   248  func (cluster *Cluster) Stop(ctx context.Context, node string) error {
   249  	if cluster.IncludeTidb {
   250  		if err := util.StopDaemon(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid")); err != nil {
   251  			return err
   252  		}
   253  	}
   254  
   255  	if err := util.StopDaemon(ctx, node, tikvBinary, path.Join(deployDir, "tikv.pid")); err != nil {
   256  		return err
   257  	}
   258  
   259  	return util.StopDaemon(ctx, node, pdBinary, path.Join(deployDir, "pd.pid"))
   260  }
   261  
   262  // Kill kills the database
   263  func (cluster *Cluster) Kill(ctx context.Context, node string) error {
   264  	if cluster.IncludeTidb {
   265  		if err := util.KillDaemon(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid")); err != nil {
   266  			return err
   267  		}
   268  	}
   269  
   270  	if err := util.KillDaemon(ctx, node, tikvBinary, path.Join(deployDir, "tikv.pid")); err != nil {
   271  		return err
   272  	}
   273  
   274  	return util.KillDaemon(ctx, node, pdBinary, path.Join(deployDir, "pd.pid"))
   275  }
   276  
   277  // IsRunning checks whether the database is running or not
   278  func (cluster *Cluster) IsRunning(ctx context.Context, node string) bool {
   279  	if cluster.IncludeTidb {
   280  		return util.IsDaemonRunning(ctx, node, tidbBinary, path.Join(deployDir, "tidb.pid"))
   281  	}
   282  	return util.IsDaemonRunning(ctx, node, tidbBinary, path.Join(deployDir, "tikv.pid"))
   283  }
   284  
   285  // Name returns the unique name for the database
   286  func (cluster *Cluster) Name() string {
   287  	return "cluster"
   288  }