vitess.io/vitess@v0.16.2/go/vt/zkctl/zkctl.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  /*
    18  Commands for controlling an external zookeeper process.
    19  */
    20  
    21  package zkctl
    22  
    23  import (
    24  	"bytes"
    25  	"fmt"
    26  	"net"
    27  	"os"
    28  	"os/exec"
    29  	"path"
    30  	"strconv"
    31  	"syscall"
    32  	"time"
    33  
    34  	zookeeper "github.com/z-division/go-zookeeper/zk"
    35  
    36  	"vitess.io/vitess/go/vt/env"
    37  	"vitess.io/vitess/go/vt/log"
    38  )
    39  
const (
	// startWaitTime bounds how long Start polls the ZooKeeper client port for
	// a healthy reply. Init reuses it both as its connection retry window and
	// as the timeout argument handed to zookeeper.Connect.
	startWaitTime = 30 * time.Second
	// shutdownWaitTime bounds how long Shutdown keeps polling for the killed
	// process to disappear before it reports a failure.
	shutdownWaitTime = 20 * time.Second
)
    46  
// Zkd manages the running of ZooKeeper servers.
type Zkd struct {
	// config supplies the file and directory paths (log dir, config file,
	// pid file, myid file, directory list) and the client port that the
	// methods of Zkd operate on.
	config *ZkConfig
	// done is nil until Start succeeds. Start then sets it to a fresh channel
	// that is closed once the process it launched has been waited on.
	done   chan struct{}
}
    52  
    53  // NewZkd creates a Zkd.
    54  func NewZkd(config *ZkConfig) *Zkd {
    55  	return &Zkd{config: config}
    56  }
    57  
    58  // Done returns a channel that is closed when the underlying process started
    59  // by this Zkd has terminated. If the process was started by someone else, this
    60  // channel will never be closed.
    61  func (zkd *Zkd) Done() <-chan struct{} {
    62  	return zkd.done
    63  }
    64  
/*
 ZOO_LOG_DIR=""
 ZOO_CFG="/.../zoo.cfg"
 ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain"
 java -DZOO_LOG_DIR=${ZOO_LOG_DIR} -cp $CLASSPATH $ZOOMAIN $ZOO_CFG
*/
    71  
    72  // Start runs an already initialized ZooKeeper server.
    73  func (zkd *Zkd) Start() error {
    74  	log.Infof("zkctl.Start")
    75  	// NOTE(msolomon) use a script here so we can detach and continue to run
    76  	// if the wrangler process dies. this pretty much the same as mysqld_safe.
    77  	args := []string{
    78  		zkd.config.LogDir(),
    79  		zkd.config.ConfigFile(),
    80  		zkd.config.PidFile(),
    81  	}
    82  	root, err := env.VtRoot()
    83  	if err != nil {
    84  		return err
    85  	}
    86  	dir := path.Join(root, "bin")
    87  	cmd := exec.Command(path.Join(dir, "zksrv.sh"), args...)
    88  	cmd.Env = os.Environ()
    89  	cmd.Dir = dir
    90  
    91  	if err = cmd.Start(); err != nil {
    92  		return err
    93  	}
    94  
    95  	// give it some time to succeed - usually by the time the socket emerges
    96  	// we are in good shape, but not always. So let's continue to retry until
    97  	// we get an imok response from the socket or we timeout.
    98  	timeout := time.Now().Add(startWaitTime)
    99  	zkAddr := fmt.Sprintf(":%v", zkd.config.ClientPort)
   100  	for time.Now().Before(timeout) {
   101  		conn, connErr := net.Dial("tcp", zkAddr)
   102  		if connErr != nil {
   103  			err = connErr
   104  		} else {
   105  			conn.Write([]byte("ruok"))
   106  			reply := make([]byte, 4)
   107  			conn.Read(reply)
   108  			conn.Close()
   109  			if string(reply) == "imok" {
   110  				err = nil
   111  				break
   112  			}
   113  			err = fmt.Errorf("local zk unhealthy: %v %v", zkAddr, reply)
   114  		}
   115  		time.Sleep(time.Second)
   116  	}
   117  	if err != nil {
   118  		return err
   119  	}
   120  	zkd.done = make(chan struct{})
   121  	go func(done chan<- struct{}) {
   122  		// wait so we don't get a bunch of defunct processes
   123  		cmd.Wait()
   124  		close(done)
   125  	}(zkd.done)
   126  	return err
   127  }
   128  
   129  // Shutdown kills a ZooKeeper server, but keeps its data dir intact.
   130  func (zkd *Zkd) Shutdown() error {
   131  	log.Infof("zkctl.Shutdown")
   132  	pidData, err := os.ReadFile(zkd.config.PidFile())
   133  	if err != nil {
   134  		return err
   135  	}
   136  	pid, err := strconv.Atoi(string(bytes.TrimSpace(pidData)))
   137  	if err != nil {
   138  		return err
   139  	}
   140  	err = syscall.Kill(pid, syscall.SIGKILL)
   141  	if err != nil && err != syscall.ESRCH {
   142  		return err
   143  	}
   144  	timeout := time.Now().Add(shutdownWaitTime)
   145  	for time.Now().Before(timeout) {
   146  		if syscall.Kill(pid, syscall.SIGKILL) == syscall.ESRCH {
   147  			return nil
   148  		}
   149  		time.Sleep(time.Second)
   150  	}
   151  	return fmt.Errorf("Shutdown didn't kill process %v", pid)
   152  }
   153  
   154  func (zkd *Zkd) makeCfg() (string, error) {
   155  	root, err := env.VtRoot()
   156  	if err != nil {
   157  		return "", err
   158  	}
   159  	cnfTemplatePaths := []string{path.Join(root, "config/zkcfg/zoo.cfg")}
   160  	return MakeZooCfg(cnfTemplatePaths, zkd.config, "# generated by vt")
   161  }
   162  
   163  // Init generates a new config and then starts ZooKeeper.
   164  func (zkd *Zkd) Init() error {
   165  	if zkd.Inited() {
   166  		return fmt.Errorf("zk already inited")
   167  	}
   168  
   169  	log.Infof("zkd.Init")
   170  	for _, path := range zkd.config.DirectoryList() {
   171  		if err := os.MkdirAll(path, 0775); err != nil {
   172  			log.Errorf("%v", err)
   173  			return err
   174  		}
   175  		// FIXME(msolomon) validate permissions?
   176  	}
   177  
   178  	configData, err := zkd.makeCfg()
   179  	if err == nil {
   180  		err = os.WriteFile(zkd.config.ConfigFile(), []byte(configData), 0664)
   181  	}
   182  	if err != nil {
   183  		log.Errorf("failed creating %v: %v", zkd.config.ConfigFile(), err)
   184  		return err
   185  	}
   186  
   187  	err = zkd.config.WriteMyid()
   188  	if err != nil {
   189  		log.Errorf("failed creating %v: %v", zkd.config.MyidFile(), err)
   190  		return err
   191  	}
   192  
   193  	if err = zkd.Start(); err != nil {
   194  		log.Errorf("failed starting, check %v", zkd.config.LogDir())
   195  		return err
   196  	}
   197  
   198  	var (
   199  		zk      *zookeeper.Conn
   200  		session <-chan zookeeper.Event
   201  		zkAddr  = fmt.Sprintf("localhost:%v", zkd.config.ClientPort)
   202  	)
   203  
   204  	// Let's retry to deal with ephemeral network issues or CI slowness.
   205  	timeout := time.Now().Add(startWaitTime)
   206  	for time.Now().Before(timeout) {
   207  		zk, session, err = zookeeper.Connect([]string{zkAddr}, startWaitTime)
   208  		if err == nil {
   209  			break
   210  		}
   211  		time.Sleep(1 * time.Second)
   212  	}
   213  	if err != nil {
   214  		return err
   215  	}
   216  	event := <-session
   217  	if event.State != zookeeper.StateConnecting {
   218  		return event.Err
   219  	}
   220  	event = <-session
   221  	if event.State != zookeeper.StateConnected {
   222  		return event.Err
   223  	}
   224  	defer zk.Close()
   225  
   226  	return nil
   227  }
   228  
   229  // Teardown shuts down the server and removes its data dir.
   230  func (zkd *Zkd) Teardown() error {
   231  	log.Infof("zkctl.Teardown")
   232  	if err := zkd.Shutdown(); err != nil {
   233  		log.Warningf("failed zookeeper shutdown: %v", err.Error())
   234  	}
   235  	var removalErr error
   236  	for _, dir := range zkd.config.DirectoryList() {
   237  		log.V(6).Infof("remove data dir %v", dir)
   238  		if err := os.RemoveAll(dir); err != nil {
   239  			log.Errorf("failed removing %v: %v", dir, err.Error())
   240  			removalErr = err
   241  		}
   242  	}
   243  	return removalErr
   244  }
   245  
   246  // Inited returns true if the server config has been initialized.
   247  func (zkd *Zkd) Inited() bool {
   248  	myidFile := zkd.config.MyidFile()
   249  	_, statErr := os.Stat(myidFile)
   250  	if statErr == nil {
   251  		return true
   252  	} else if statErr.(*os.PathError).Err != syscall.ENOENT {
   253  		panic("can't access file " + myidFile + ": " + statErr.Error())
   254  	}
   255  	return false
   256  }