github.com/kjdelisle/consul@v1.4.5/command/lock/lock.go (about) 1 package lock 2 3 import ( 4 "flag" 5 "fmt" 6 "os" 7 osexec "os/exec" 8 "path" 9 "strings" 10 "sync" 11 "syscall" 12 "time" 13 14 "github.com/hashicorp/consul/agent" 15 "github.com/hashicorp/consul/agent/exec" 16 "github.com/hashicorp/consul/api" 17 "github.com/hashicorp/consul/command/flags" 18 "github.com/mitchellh/cli" 19 ) 20 21 const ( 22 // lockKillGracePeriod is how long we allow a child between 23 // a SIGTERM and a SIGKILL. This is to let the child cleanup 24 // any necessary state. We have to balance this with the risk 25 // of a split-brain where multiple children may be acting as if 26 // they hold a lock. This value is currently based on the default 27 // lock-delay value of 15 seconds. This only affects locks and not 28 // semaphores. 29 lockKillGracePeriod = 5 * time.Second 30 31 // defaultMonitorRetry is the number of 500 errors we will tolerate 32 // before declaring the lock gone. 33 defaultMonitorRetry = 3 34 35 // defaultMonitorRetryTime is the amount of time to wait between 36 // retries. 37 defaultMonitorRetryTime = 1 * time.Second 38 ) 39 40 // LockCommand is a Command implementation that is used to setup 41 // a "lock" which manages lock acquisition and invokes a sub-process 42 type cmd struct { 43 UI cli.Ui 44 flags *flag.FlagSet 45 http *flags.HTTPFlags 46 help string 47 48 ShutdownCh <-chan struct{} 49 50 child *os.Process 51 childLock sync.Mutex 52 verbose bool 53 54 // flags 55 limit int 56 monitorRetry int 57 name string 58 passStdin bool 59 propagateChildCode bool 60 shell bool 61 timeout time.Duration 62 } 63 64 func New(ui cli.Ui) *cmd { 65 c := &cmd{UI: ui} 66 c.init() 67 return c 68 } 69 70 func (c *cmd) init() { 71 c.flags = flag.NewFlagSet("", flag.ContinueOnError) 72 c.flags.BoolVar(&c.propagateChildCode, "child-exit-code", false, 73 "Exit 2 if the child process exited with an error if this is true, "+ 74 "otherwise this doesn't propagate an error from the child. The "+ 75 "default value is false.") 76 c.flags.IntVar(&c.limit, "n", 1, 77 "Optional limit on the number of concurrent lock holders. The underlying "+ 78 "implementation switches from a lock to a semaphore when the value is "+ 79 "greater than 1. The default value is 1.") 80 c.flags.IntVar(&c.monitorRetry, "monitor-retry", defaultMonitorRetry, 81 "Number of times to retry if Consul returns a 500 error while monitoring "+ 82 "the lock. This allows riding out brief periods of unavailability "+ 83 "without causing leader elections, but increases the amount of time "+ 84 "required to detect a lost lock in some cases. The default value is 3, "+ 85 "with a 1s wait between retries. Set this value to 0 to disable retires.") 86 c.flags.StringVar(&c.name, "name", "", 87 "Optional name to associate with the lock session. It not provided, one "+ 88 "is generated based on the provided child command.") 89 c.flags.BoolVar(&c.passStdin, "pass-stdin", false, 90 "Pass stdin to the child process.") 91 c.flags.BoolVar(&c.shell, "shell", true, 92 "Use a shell to run the command (can set a custom shell via the SHELL "+ 93 "environment variable).") 94 c.flags.DurationVar(&c.timeout, "timeout", 0, 95 "Maximum amount of time to wait to acquire the lock, specified as a "+ 96 "duration like \"1s\" or \"3h\". The default value is 0.") 97 c.flags.BoolVar(&c.verbose, "verbose", false, 98 "Enable verbose (debugging) output.") 99 100 // Deprecations 101 c.flags.DurationVar(&c.timeout, "try", 0, 102 "DEPRECATED. Use -timeout instead.") 103 104 c.http = &flags.HTTPFlags{} 105 flags.Merge(c.flags, c.http.ClientFlags()) 106 flags.Merge(c.flags, c.http.ServerFlags()) 107 c.help = flags.Usage(help, c.flags) 108 } 109 110 func (c *cmd) Run(args []string) int { 111 var lu *LockUnlock 112 return c.run(args, &lu) 113 } 114 115 func (c *cmd) run(args []string, lu **LockUnlock) int { 116 if err := c.flags.Parse(args); err != nil { 117 return 1 118 } 119 120 // Check the limit 121 if c.limit <= 0 { 122 c.UI.Error(fmt.Sprintf("Lock holder limit must be positive")) 123 return 1 124 } 125 126 // Verify the prefix and child are provided 127 extra := c.flags.Args() 128 if len(extra) < 2 { 129 c.UI.Error("Key prefix and child command must be specified") 130 return 1 131 } 132 prefix := extra[0] 133 prefix = strings.TrimPrefix(prefix, "/") 134 135 if c.timeout < 0 { 136 c.UI.Error("Timeout must be positive") 137 return 1 138 } 139 140 // Calculate a session name if none provided 141 if c.name == "" { 142 c.name = fmt.Sprintf("Consul lock for '%s' at '%s'", strings.Join(extra[1:], " "), prefix) 143 } 144 145 // Calculate oneshot 146 oneshot := c.timeout > 0 147 148 // Check the retry parameter 149 if c.monitorRetry < 0 { 150 c.UI.Error("Number for 'monitor-retry' must be >= 0") 151 return 1 152 } 153 154 // Create and test the HTTP client 155 client, err := c.http.APIClient() 156 if err != nil { 157 c.UI.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err)) 158 return 1 159 } 160 _, err = client.Agent().NodeName() 161 if err != nil { 162 c.UI.Error(fmt.Sprintf("Error querying Consul agent: %s", err)) 163 return 1 164 } 165 166 // Setup the lock or semaphore 167 if c.limit == 1 { 168 *lu, err = c.setupLock(client, prefix, c.name, oneshot, c.timeout, c.monitorRetry) 169 } else { 170 *lu, err = c.setupSemaphore(client, c.limit, prefix, c.name, oneshot, c.timeout, c.monitorRetry) 171 } 172 if err != nil { 173 c.UI.Error(fmt.Sprintf("Lock setup failed: %s", err)) 174 return 1 175 } 176 177 // Attempt the acquisition 178 if c.verbose { 179 c.UI.Info("Attempting lock acquisition") 180 } 181 lockCh, err := (*lu).lockFn(c.ShutdownCh) 182 if lockCh == nil { 183 if err == nil { 184 c.UI.Error("Shutdown triggered or timeout during lock acquisition") 185 } else { 186 c.UI.Error(fmt.Sprintf("Lock acquisition failed: %s", err)) 187 } 188 return 1 189 } 190 191 // Check if we were shutdown but managed to still acquire the lock 192 var childCode int 193 var childErr chan error 194 select { 195 case <-c.ShutdownCh: 196 c.UI.Error("Shutdown triggered during lock acquisition") 197 goto RELEASE 198 default: 199 } 200 201 // Start the child process 202 childErr = make(chan error, 1) 203 go func() { 204 childErr <- c.startChild(c.flags.Args()[1:], c.passStdin, c.shell) 205 }() 206 207 // Monitor for shutdown, child termination, or lock loss 208 select { 209 case <-c.ShutdownCh: 210 if c.verbose { 211 c.UI.Info("Shutdown triggered, killing child") 212 } 213 case <-lockCh: 214 if c.verbose { 215 c.UI.Info("Lock lost, killing child") 216 } 217 case err := <-childErr: 218 if err != nil { 219 childCode = 2 220 } 221 if c.verbose { 222 c.UI.Info("Child terminated, releasing lock") 223 } 224 goto RELEASE 225 } 226 227 // Prevent starting a new child. The lock is never released 228 // after this point. 229 c.childLock.Lock() 230 231 // Kill any existing child 232 if err := c.killChild(childErr); err != nil { 233 c.UI.Error(fmt.Sprintf("%s", err)) 234 } 235 236 RELEASE: 237 // Release the lock before termination 238 if err := (*lu).unlockFn(); err != nil { 239 c.UI.Error(fmt.Sprintf("Lock release failed: %s", err)) 240 return 1 241 } 242 243 // Cleanup the lock if no longer in use 244 if err := (*lu).cleanupFn(); err != nil { 245 if err != (*lu).inUseErr { 246 c.UI.Error(fmt.Sprintf("Lock cleanup failed: %s", err)) 247 return 1 248 } else if c.verbose { 249 c.UI.Info("Cleanup aborted, lock in use") 250 } 251 } else if c.verbose { 252 c.UI.Info("Cleanup succeeded") 253 } 254 255 // If we detected an error from the child process then we propagate 256 // that. 257 if c.propagateChildCode { 258 return childCode 259 } 260 261 return 0 262 } 263 264 // setupLock is used to setup a new Lock given the API client, the key prefix to 265 // operate on, and an optional session name. If oneshot is true then we will set 266 // up for a single attempt at acquisition, using the given wait time. The retry 267 // parameter sets how many 500 errors the lock monitor will tolerate before 268 // giving up the lock. 269 func (c *cmd) setupLock(client *api.Client, prefix, name string, 270 oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) { 271 // Use the DefaultSemaphoreKey extension, this way if a lock and 272 // semaphore are both used at the same prefix, we will get a conflict 273 // which we can report to the user. 274 key := path.Join(prefix, api.DefaultSemaphoreKey) 275 if c.verbose { 276 c.UI.Info(fmt.Sprintf("Setting up lock at path: %s", key)) 277 } 278 opts := api.LockOptions{ 279 Key: key, 280 SessionName: name, 281 MonitorRetries: retry, 282 MonitorRetryTime: defaultMonitorRetryTime, 283 } 284 if oneshot { 285 opts.LockTryOnce = true 286 opts.LockWaitTime = wait 287 } 288 l, err := client.LockOpts(&opts) 289 if err != nil { 290 return nil, err 291 } 292 lu := &LockUnlock{ 293 lockFn: l.Lock, 294 unlockFn: l.Unlock, 295 cleanupFn: l.Destroy, 296 inUseErr: api.ErrLockInUse, 297 rawOpts: &opts, 298 } 299 return lu, nil 300 } 301 302 // setupSemaphore is used to setup a new Semaphore given the API client, key 303 // prefix, session name, and slot holder limit. If oneshot is true then we will 304 // set up for a single attempt at acquisition, using the given wait time. The 305 // retry parameter sets how many 500 errors the lock monitor will tolerate 306 // before giving up the semaphore. 307 func (c *cmd) setupSemaphore(client *api.Client, limit int, prefix, name string, 308 oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) { 309 if c.verbose { 310 c.UI.Info(fmt.Sprintf("Setting up semaphore (limit %d) at prefix: %s", limit, prefix)) 311 } 312 opts := api.SemaphoreOptions{ 313 Prefix: prefix, 314 Limit: limit, 315 SessionName: name, 316 MonitorRetries: retry, 317 MonitorRetryTime: defaultMonitorRetryTime, 318 } 319 if oneshot { 320 opts.SemaphoreTryOnce = true 321 opts.SemaphoreWaitTime = wait 322 } 323 s, err := client.SemaphoreOpts(&opts) 324 if err != nil { 325 return nil, err 326 } 327 lu := &LockUnlock{ 328 lockFn: s.Acquire, 329 unlockFn: s.Release, 330 cleanupFn: s.Destroy, 331 inUseErr: api.ErrSemaphoreInUse, 332 rawOpts: &opts, 333 } 334 return lu, nil 335 } 336 337 // startChild is a long running routine used to start and 338 // wait for the child process to exit. 339 func (c *cmd) startChild(args []string, passStdin, shell bool) error { 340 if c.verbose { 341 c.UI.Info("Starting handler") 342 } 343 344 // Create the command 345 var cmd *osexec.Cmd 346 var err error 347 if !shell { 348 cmd, err = exec.Subprocess(args) 349 } else { 350 cmd, err = exec.Script(strings.Join(args, " ")) 351 } 352 if err != nil { 353 c.UI.Error(fmt.Sprintf("Error executing handler: %s", err)) 354 return err 355 } 356 357 // Setup the command streams 358 cmd.Env = append(os.Environ(), 359 "CONSUL_LOCK_HELD=true", 360 ) 361 if passStdin { 362 if c.verbose { 363 c.UI.Info("Stdin passed to handler process") 364 } 365 cmd.Stdin = os.Stdin 366 } else { 367 cmd.Stdin = nil 368 } 369 cmd.Stdout = os.Stdout 370 cmd.Stderr = os.Stderr 371 372 // Start the child process 373 c.childLock.Lock() 374 if err := cmd.Start(); err != nil { 375 c.UI.Error(fmt.Sprintf("Error starting handler: %s", err)) 376 c.childLock.Unlock() 377 return err 378 } 379 380 // Set up signal forwarding. 381 doneCh := make(chan struct{}) 382 defer close(doneCh) 383 logFn := func(err error) { 384 c.UI.Error(fmt.Sprintf("Warning, could not forward signal: %s", err)) 385 } 386 agent.ForwardSignals(cmd, logFn, doneCh) 387 388 // Setup the child info 389 c.child = cmd.Process 390 c.childLock.Unlock() 391 392 // Wait for the child process 393 if err := cmd.Wait(); err != nil { 394 c.UI.Error(fmt.Sprintf("Error running handler: %s", err)) 395 return err 396 } 397 return nil 398 } 399 400 // killChild is used to forcefully kill the child, first using SIGTERM 401 // to allow for a graceful cleanup and then using SIGKILL for a hard 402 // termination. 403 // On Windows, the child is always hard terminated with a SIGKILL, even 404 // on the first attempt. 405 func (c *cmd) killChild(childErr chan error) error { 406 // Get the child process 407 child := c.child 408 409 // If there is no child process (failed to start), we can quit early 410 if child == nil { 411 if c.verbose { 412 c.UI.Info("No child process to kill") 413 } 414 return nil 415 } 416 417 // Attempt termination first 418 if c.verbose { 419 c.UI.Info(fmt.Sprintf("Terminating child pid %d", child.Pid)) 420 } 421 if err := signalPid(child.Pid, syscall.SIGTERM); err != nil { 422 return fmt.Errorf("Failed to terminate %d: %v", child.Pid, err) 423 } 424 425 // Wait for termination, or until a timeout 426 select { 427 case <-childErr: 428 if c.verbose { 429 c.UI.Info("Child terminated") 430 } 431 return nil 432 case <-time.After(lockKillGracePeriod): 433 if c.verbose { 434 c.UI.Info(fmt.Sprintf("Child did not exit after grace period of %v", 435 lockKillGracePeriod)) 436 } 437 } 438 439 // Send a final SIGKILL 440 if c.verbose { 441 c.UI.Info(fmt.Sprintf("Killing child pid %d", child.Pid)) 442 } 443 if err := signalPid(child.Pid, syscall.SIGKILL); err != nil { 444 return fmt.Errorf("Failed to kill %d: %v", child.Pid, err) 445 } 446 return nil 447 } 448 449 func (c *cmd) Synopsis() string { 450 return synopsis 451 } 452 453 func (c *cmd) Help() string { 454 return c.help 455 } 456 457 // LockUnlock is used to abstract over the differences between 458 // a lock and a semaphore. 459 type LockUnlock struct { 460 lockFn func(<-chan struct{}) (<-chan struct{}, error) 461 unlockFn func() error 462 cleanupFn func() error 463 inUseErr error 464 rawOpts interface{} 465 } 466 467 const synopsis = "Execute a command holding a lock" 468 const help = ` 469 Usage: consul lock [options] prefix child... 470 471 Acquires a lock or semaphore at a given path, and invokes a child process 472 when successful. The child process can assume the lock is held while it 473 executes. If the lock is lost or communication is disrupted the child 474 process will be sent a SIGTERM signal and given time to gracefully exit. 475 After the grace period expires the process will be hard terminated. 476 477 For Consul agents on Windows, the child process is always hard terminated 478 with a SIGKILL, since Windows has no POSIX compatible notion for SIGTERM. 479 480 When -n=1, only a single lock holder or leader exists providing mutual 481 exclusion. Setting a higher value switches to a semaphore allowing multiple 482 holders to coordinate. 483 484 The prefix provided must have write privileges. 485 `