github.com/kjdelisle/consul@v1.4.5/command/lock/lock.go (about)

     1  package lock
     2  
     3  import (
     4  	"flag"
     5  	"fmt"
     6  	"os"
     7  	osexec "os/exec"
     8  	"path"
     9  	"strings"
    10  	"sync"
    11  	"syscall"
    12  	"time"
    13  
    14  	"github.com/hashicorp/consul/agent"
    15  	"github.com/hashicorp/consul/agent/exec"
    16  	"github.com/hashicorp/consul/api"
    17  	"github.com/hashicorp/consul/command/flags"
    18  	"github.com/mitchellh/cli"
    19  )
    20  
const (
	// lockKillGracePeriod is how long we allow a child between
	// a SIGTERM and a SIGKILL. This is to let the child clean up
	// any necessary state. We have to balance this with the risk
	// of a split-brain where multiple children may be acting as if
	// they hold a lock. This value is currently based on the default
	// lock-delay value of 15 seconds. This only affects locks and not
	// semaphores.
	lockKillGracePeriod = 5 * time.Second

	// defaultMonitorRetry is the default for the -monitor-retry flag: the
	// number of 500 errors we will tolerate while monitoring before
	// declaring the lock gone.
	defaultMonitorRetry = 3

	// defaultMonitorRetryTime is the amount of time to wait between
	// monitor retries.
	defaultMonitorRetryTime = 1 * time.Second
)
    39  
// cmd implements the "consul lock" command. It manages lock (or semaphore)
// acquisition and invokes a sub-process while the lock is held.
type cmd struct {
	// UI receives all informational and error output. flags is the parsed
	// flag set, http holds the shared HTTP client/server flags, and help is
	// the rendered usage text returned by Help.
	UI    cli.Ui
	flags *flag.FlagSet
	http  *flags.HTTPFlags
	help  string

	// ShutdownCh is watched during acquisition and while the child runs;
	// a receive on it triggers shutdown of the command.
	ShutdownCh <-chan struct{}

	// child is the started child process (nil until startChild has started
	// it). childLock is held while the child is being started and recorded,
	// and is taken permanently by run to prevent a new child from starting.
	// verbose enables debugging output.
	child     *os.Process
	childLock sync.Mutex
	verbose   bool

	// flags
	limit              int
	monitorRetry       int
	name               string
	passStdin          bool
	propagateChildCode bool
	shell              bool
	timeout            time.Duration
}
    63  
    64  func New(ui cli.Ui) *cmd {
    65  	c := &cmd{UI: ui}
    66  	c.init()
    67  	return c
    68  }
    69  
    70  func (c *cmd) init() {
    71  	c.flags = flag.NewFlagSet("", flag.ContinueOnError)
    72  	c.flags.BoolVar(&c.propagateChildCode, "child-exit-code", false,
    73  		"Exit 2 if the child process exited with an error if this is true, "+
    74  			"otherwise this doesn't propagate an error from the child. The "+
    75  			"default value is false.")
    76  	c.flags.IntVar(&c.limit, "n", 1,
    77  		"Optional limit on the number of concurrent lock holders. The underlying "+
    78  			"implementation switches from a lock to a semaphore when the value is "+
    79  			"greater than 1. The default value is 1.")
    80  	c.flags.IntVar(&c.monitorRetry, "monitor-retry", defaultMonitorRetry,
    81  		"Number of times to retry if Consul returns a 500 error while monitoring "+
    82  			"the lock. This allows riding out brief periods of unavailability "+
    83  			"without causing leader elections, but increases the amount of time "+
    84  			"required to detect a lost lock in some cases. The default value is 3, "+
    85  			"with a 1s wait between retries. Set this value to 0 to disable retires.")
    86  	c.flags.StringVar(&c.name, "name", "",
    87  		"Optional name to associate with the lock session. It not provided, one "+
    88  			"is generated based on the provided child command.")
    89  	c.flags.BoolVar(&c.passStdin, "pass-stdin", false,
    90  		"Pass stdin to the child process.")
    91  	c.flags.BoolVar(&c.shell, "shell", true,
    92  		"Use a shell to run the command (can set a custom shell via the SHELL "+
    93  			"environment variable).")
    94  	c.flags.DurationVar(&c.timeout, "timeout", 0,
    95  		"Maximum amount of time to wait to acquire the lock, specified as a "+
    96  			"duration like \"1s\" or \"3h\". The default value is 0.")
    97  	c.flags.BoolVar(&c.verbose, "verbose", false,
    98  		"Enable verbose (debugging) output.")
    99  
   100  	// Deprecations
   101  	c.flags.DurationVar(&c.timeout, "try", 0,
   102  		"DEPRECATED. Use -timeout instead.")
   103  
   104  	c.http = &flags.HTTPFlags{}
   105  	flags.Merge(c.flags, c.http.ClientFlags())
   106  	flags.Merge(c.flags, c.http.ServerFlags())
   107  	c.help = flags.Usage(help, c.flags)
   108  }
   109  
   110  func (c *cmd) Run(args []string) int {
   111  	var lu *LockUnlock
   112  	return c.run(args, &lu)
   113  }
   114  
   115  func (c *cmd) run(args []string, lu **LockUnlock) int {
   116  	if err := c.flags.Parse(args); err != nil {
   117  		return 1
   118  	}
   119  
   120  	// Check the limit
   121  	if c.limit <= 0 {
   122  		c.UI.Error(fmt.Sprintf("Lock holder limit must be positive"))
   123  		return 1
   124  	}
   125  
   126  	// Verify the prefix and child are provided
   127  	extra := c.flags.Args()
   128  	if len(extra) < 2 {
   129  		c.UI.Error("Key prefix and child command must be specified")
   130  		return 1
   131  	}
   132  	prefix := extra[0]
   133  	prefix = strings.TrimPrefix(prefix, "/")
   134  
   135  	if c.timeout < 0 {
   136  		c.UI.Error("Timeout must be positive")
   137  		return 1
   138  	}
   139  
   140  	// Calculate a session name if none provided
   141  	if c.name == "" {
   142  		c.name = fmt.Sprintf("Consul lock for '%s' at '%s'", strings.Join(extra[1:], " "), prefix)
   143  	}
   144  
   145  	// Calculate oneshot
   146  	oneshot := c.timeout > 0
   147  
   148  	// Check the retry parameter
   149  	if c.monitorRetry < 0 {
   150  		c.UI.Error("Number for 'monitor-retry' must be >= 0")
   151  		return 1
   152  	}
   153  
   154  	// Create and test the HTTP client
   155  	client, err := c.http.APIClient()
   156  	if err != nil {
   157  		c.UI.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err))
   158  		return 1
   159  	}
   160  	_, err = client.Agent().NodeName()
   161  	if err != nil {
   162  		c.UI.Error(fmt.Sprintf("Error querying Consul agent: %s", err))
   163  		return 1
   164  	}
   165  
   166  	// Setup the lock or semaphore
   167  	if c.limit == 1 {
   168  		*lu, err = c.setupLock(client, prefix, c.name, oneshot, c.timeout, c.monitorRetry)
   169  	} else {
   170  		*lu, err = c.setupSemaphore(client, c.limit, prefix, c.name, oneshot, c.timeout, c.monitorRetry)
   171  	}
   172  	if err != nil {
   173  		c.UI.Error(fmt.Sprintf("Lock setup failed: %s", err))
   174  		return 1
   175  	}
   176  
   177  	// Attempt the acquisition
   178  	if c.verbose {
   179  		c.UI.Info("Attempting lock acquisition")
   180  	}
   181  	lockCh, err := (*lu).lockFn(c.ShutdownCh)
   182  	if lockCh == nil {
   183  		if err == nil {
   184  			c.UI.Error("Shutdown triggered or timeout during lock acquisition")
   185  		} else {
   186  			c.UI.Error(fmt.Sprintf("Lock acquisition failed: %s", err))
   187  		}
   188  		return 1
   189  	}
   190  
   191  	// Check if we were shutdown but managed to still acquire the lock
   192  	var childCode int
   193  	var childErr chan error
   194  	select {
   195  	case <-c.ShutdownCh:
   196  		c.UI.Error("Shutdown triggered during lock acquisition")
   197  		goto RELEASE
   198  	default:
   199  	}
   200  
   201  	// Start the child process
   202  	childErr = make(chan error, 1)
   203  	go func() {
   204  		childErr <- c.startChild(c.flags.Args()[1:], c.passStdin, c.shell)
   205  	}()
   206  
   207  	// Monitor for shutdown, child termination, or lock loss
   208  	select {
   209  	case <-c.ShutdownCh:
   210  		if c.verbose {
   211  			c.UI.Info("Shutdown triggered, killing child")
   212  		}
   213  	case <-lockCh:
   214  		if c.verbose {
   215  			c.UI.Info("Lock lost, killing child")
   216  		}
   217  	case err := <-childErr:
   218  		if err != nil {
   219  			childCode = 2
   220  		}
   221  		if c.verbose {
   222  			c.UI.Info("Child terminated, releasing lock")
   223  		}
   224  		goto RELEASE
   225  	}
   226  
   227  	// Prevent starting a new child.  The lock is never released
   228  	// after this point.
   229  	c.childLock.Lock()
   230  
   231  	// Kill any existing child
   232  	if err := c.killChild(childErr); err != nil {
   233  		c.UI.Error(fmt.Sprintf("%s", err))
   234  	}
   235  
   236  RELEASE:
   237  	// Release the lock before termination
   238  	if err := (*lu).unlockFn(); err != nil {
   239  		c.UI.Error(fmt.Sprintf("Lock release failed: %s", err))
   240  		return 1
   241  	}
   242  
   243  	// Cleanup the lock if no longer in use
   244  	if err := (*lu).cleanupFn(); err != nil {
   245  		if err != (*lu).inUseErr {
   246  			c.UI.Error(fmt.Sprintf("Lock cleanup failed: %s", err))
   247  			return 1
   248  		} else if c.verbose {
   249  			c.UI.Info("Cleanup aborted, lock in use")
   250  		}
   251  	} else if c.verbose {
   252  		c.UI.Info("Cleanup succeeded")
   253  	}
   254  
   255  	// If we detected an error from the child process then we propagate
   256  	// that.
   257  	if c.propagateChildCode {
   258  		return childCode
   259  	}
   260  
   261  	return 0
   262  }
   263  
   264  // setupLock is used to setup a new Lock given the API client, the key prefix to
   265  // operate on, and an optional session name. If oneshot is true then we will set
   266  // up for a single attempt at acquisition, using the given wait time. The retry
   267  // parameter sets how many 500 errors the lock monitor will tolerate before
   268  // giving up the lock.
   269  func (c *cmd) setupLock(client *api.Client, prefix, name string,
   270  	oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) {
   271  	// Use the DefaultSemaphoreKey extension, this way if a lock and
   272  	// semaphore are both used at the same prefix, we will get a conflict
   273  	// which we can report to the user.
   274  	key := path.Join(prefix, api.DefaultSemaphoreKey)
   275  	if c.verbose {
   276  		c.UI.Info(fmt.Sprintf("Setting up lock at path: %s", key))
   277  	}
   278  	opts := api.LockOptions{
   279  		Key:              key,
   280  		SessionName:      name,
   281  		MonitorRetries:   retry,
   282  		MonitorRetryTime: defaultMonitorRetryTime,
   283  	}
   284  	if oneshot {
   285  		opts.LockTryOnce = true
   286  		opts.LockWaitTime = wait
   287  	}
   288  	l, err := client.LockOpts(&opts)
   289  	if err != nil {
   290  		return nil, err
   291  	}
   292  	lu := &LockUnlock{
   293  		lockFn:    l.Lock,
   294  		unlockFn:  l.Unlock,
   295  		cleanupFn: l.Destroy,
   296  		inUseErr:  api.ErrLockInUse,
   297  		rawOpts:   &opts,
   298  	}
   299  	return lu, nil
   300  }
   301  
   302  // setupSemaphore is used to setup a new Semaphore given the API client, key
   303  // prefix, session name, and slot holder limit. If oneshot is true then we will
   304  // set up for a single attempt at acquisition, using the given wait time. The
   305  // retry parameter sets how many 500 errors the lock monitor will tolerate
   306  // before giving up the semaphore.
   307  func (c *cmd) setupSemaphore(client *api.Client, limit int, prefix, name string,
   308  	oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) {
   309  	if c.verbose {
   310  		c.UI.Info(fmt.Sprintf("Setting up semaphore (limit %d) at prefix: %s", limit, prefix))
   311  	}
   312  	opts := api.SemaphoreOptions{
   313  		Prefix:           prefix,
   314  		Limit:            limit,
   315  		SessionName:      name,
   316  		MonitorRetries:   retry,
   317  		MonitorRetryTime: defaultMonitorRetryTime,
   318  	}
   319  	if oneshot {
   320  		opts.SemaphoreTryOnce = true
   321  		opts.SemaphoreWaitTime = wait
   322  	}
   323  	s, err := client.SemaphoreOpts(&opts)
   324  	if err != nil {
   325  		return nil, err
   326  	}
   327  	lu := &LockUnlock{
   328  		lockFn:    s.Acquire,
   329  		unlockFn:  s.Release,
   330  		cleanupFn: s.Destroy,
   331  		inUseErr:  api.ErrSemaphoreInUse,
   332  		rawOpts:   &opts,
   333  	}
   334  	return lu, nil
   335  }
   336  
   337  // startChild is a long running routine used to start and
   338  // wait for the child process to exit.
   339  func (c *cmd) startChild(args []string, passStdin, shell bool) error {
   340  	if c.verbose {
   341  		c.UI.Info("Starting handler")
   342  	}
   343  
   344  	// Create the command
   345  	var cmd *osexec.Cmd
   346  	var err error
   347  	if !shell {
   348  		cmd, err = exec.Subprocess(args)
   349  	} else {
   350  		cmd, err = exec.Script(strings.Join(args, " "))
   351  	}
   352  	if err != nil {
   353  		c.UI.Error(fmt.Sprintf("Error executing handler: %s", err))
   354  		return err
   355  	}
   356  
   357  	// Setup the command streams
   358  	cmd.Env = append(os.Environ(),
   359  		"CONSUL_LOCK_HELD=true",
   360  	)
   361  	if passStdin {
   362  		if c.verbose {
   363  			c.UI.Info("Stdin passed to handler process")
   364  		}
   365  		cmd.Stdin = os.Stdin
   366  	} else {
   367  		cmd.Stdin = nil
   368  	}
   369  	cmd.Stdout = os.Stdout
   370  	cmd.Stderr = os.Stderr
   371  
   372  	// Start the child process
   373  	c.childLock.Lock()
   374  	if err := cmd.Start(); err != nil {
   375  		c.UI.Error(fmt.Sprintf("Error starting handler: %s", err))
   376  		c.childLock.Unlock()
   377  		return err
   378  	}
   379  
   380  	// Set up signal forwarding.
   381  	doneCh := make(chan struct{})
   382  	defer close(doneCh)
   383  	logFn := func(err error) {
   384  		c.UI.Error(fmt.Sprintf("Warning, could not forward signal: %s", err))
   385  	}
   386  	agent.ForwardSignals(cmd, logFn, doneCh)
   387  
   388  	// Setup the child info
   389  	c.child = cmd.Process
   390  	c.childLock.Unlock()
   391  
   392  	// Wait for the child process
   393  	if err := cmd.Wait(); err != nil {
   394  		c.UI.Error(fmt.Sprintf("Error running handler: %s", err))
   395  		return err
   396  	}
   397  	return nil
   398  }
   399  
   400  // killChild is used to forcefully kill the child, first using SIGTERM
   401  // to allow for a graceful cleanup and then using SIGKILL for a hard
   402  // termination.
   403  // On Windows, the child is always hard terminated with a SIGKILL, even
   404  // on the first attempt.
   405  func (c *cmd) killChild(childErr chan error) error {
   406  	// Get the child process
   407  	child := c.child
   408  
   409  	// If there is no child process (failed to start), we can quit early
   410  	if child == nil {
   411  		if c.verbose {
   412  			c.UI.Info("No child process to kill")
   413  		}
   414  		return nil
   415  	}
   416  
   417  	// Attempt termination first
   418  	if c.verbose {
   419  		c.UI.Info(fmt.Sprintf("Terminating child pid %d", child.Pid))
   420  	}
   421  	if err := signalPid(child.Pid, syscall.SIGTERM); err != nil {
   422  		return fmt.Errorf("Failed to terminate %d: %v", child.Pid, err)
   423  	}
   424  
   425  	// Wait for termination, or until a timeout
   426  	select {
   427  	case <-childErr:
   428  		if c.verbose {
   429  			c.UI.Info("Child terminated")
   430  		}
   431  		return nil
   432  	case <-time.After(lockKillGracePeriod):
   433  		if c.verbose {
   434  			c.UI.Info(fmt.Sprintf("Child did not exit after grace period of %v",
   435  				lockKillGracePeriod))
   436  		}
   437  	}
   438  
   439  	// Send a final SIGKILL
   440  	if c.verbose {
   441  		c.UI.Info(fmt.Sprintf("Killing child pid %d", child.Pid))
   442  	}
   443  	if err := signalPid(child.Pid, syscall.SIGKILL); err != nil {
   444  		return fmt.Errorf("Failed to kill %d: %v", child.Pid, err)
   445  	}
   446  	return nil
   447  }
   448  
// Synopsis returns the one-line description of the lock command.
func (c *cmd) Synopsis() string {
	return synopsis
}
   452  
// Help returns the usage text that init rendered from the help template and
// the command's flag set.
func (c *cmd) Help() string {
	return c.help
}
   456  
// LockUnlock is used to abstract over the differences between
// a lock and a semaphore.
//
// lockFn acquires the lock; it is given the shutdown channel and returns a
// channel that run watches to detect that the lock was lost. unlockFn
// releases the lock and cleanupFn destroys it. inUseErr is the error
// cleanupFn returns when the lock is still in use, which run treats as a
// non-failure. rawOpts holds a pointer to the api.LockOptions or
// api.SemaphoreOptions the lock was created with.
type LockUnlock struct {
	lockFn    func(<-chan struct{}) (<-chan struct{}, error)
	unlockFn  func() error
	cleanupFn func() error
	inUseErr  error
	rawOpts   interface{}
}
   466  
// synopsis is the one-line command description returned by Synopsis.
const synopsis = "Execute a command holding a lock"

// help is the usage text that init passes to flags.Usage together with the
// flag set to build the string returned by Help.
const help = `
Usage: consul lock [options] prefix child...

  Acquires a lock or semaphore at a given path, and invokes a child process
  when successful. The child process can assume the lock is held while it
  executes. If the lock is lost or communication is disrupted the child
  process will be sent a SIGTERM signal and given time to gracefully exit.
  After the grace period expires the process will be hard terminated.

  For Consul agents on Windows, the child process is always hard terminated
  with a SIGKILL, since Windows has no POSIX compatible notion for SIGTERM.

  When -n=1, only a single lock holder or leader exists providing mutual
  exclusion. Setting a higher value switches to a semaphore allowing multiple
  holders to coordinate.

  The prefix provided must have write privileges.
`