github.com/outbrain/consul@v1.4.5/agent/proxyprocess/daemon.go (about)

     1  package proxyprocess
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"os"
     7  	"os/exec"
     8  	"reflect"
     9  	"strconv"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/hashicorp/consul/lib/file"
    14  	"github.com/mitchellh/mapstructure"
    15  )
    16  
// Constants related to restart timers with the daemon mode proxies. At some
// point we will probably want to expose these knobs to an end user, but
// reasonable defaults are chosen.
//
// All three are consumed by Daemon.keepAlive:
//
//   - DaemonRestartHealthy is added to the current time to form the deadline
//     after which the restart attempt counter may be reset to zero.
//   - DaemonRestartBackoffMin is the number of attempts made without any
//     delay; once the attempt counter exceeds it, the loop waits
//     2^(attempts-DaemonRestartBackoffMin) seconds before the next start.
//   - DaemonRestartMaxWait is the upper bound applied to that wait.
const (
	DaemonRestartHealthy    = 10 * time.Second // time before considering healthy
	DaemonRestartBackoffMin = 3                // 3 attempts before backing off
	DaemonRestartMaxWait    = 1 * time.Minute  // maximum backoff wait time
)
    25  
// Daemon is a long-running proxy process. It is expected to keep running
// and to use blocking queries to detect changes in configuration, certs,
// and more.
//
// Consul will ensure that if the daemon crashes, that it is restarted.
type Daemon struct {
	// Command is the command to execute to start this daemon. This must
	// be a Cmd that isn't yet started. It is copied (never started
	// directly) each time the process is launched.
	Command *exec.Cmd

	// ProxyID is the ID of the proxy service. This is required for API
	// requests (along with the token) and is passed via env var.
	ProxyID string

	// ProxyToken is the special local-only ACL token that allows a proxy
	// to communicate to the Connect-specific endpoints. It is passed to
	// the process via env var.
	ProxyToken string

	// Logger is where logs will be sent around the management of this
	// daemon. The actual logs for the daemon itself will be sent to
	// a file.
	Logger *log.Logger

	// PidPath is the path where a pid file will be created storing the
	// pid of the active process. If this is empty then a pid-file won't
	// be created. Under erroneous conditions, the pid file may not be
	// created but the error will be logged to the Logger.
	PidPath string

	// For tests, they can set this to change the default duration to wait
	// for a graceful quit. Zero means Stop uses its default of 5 seconds.
	gracefulWait time.Duration

	// Run state for the keepAlive loop:
	//
	//   lock     is held by Start, Stop, Close, the snapshot methods and
	//            keepAlive while they read or write the fields below.
	//            (Stop reads exitedCh after releasing it.)
	//   stopped  is set by Stop/Close; once true the daemon is never
	//            (re)started.
	//   stopCh   is closed by Stop/Close to interrupt a backoff wait.
	//   exitedCh is closed by keepAlive when the loop returns.
	//   process  is the most recently started (or adopted) process. It is
	//            not cleared when that process exits.
	lock     sync.Mutex
	stopped  bool
	stopCh   chan struct{}
	exitedCh chan struct{}
	process  *os.Process
}
    66  
    67  // Start starts the daemon and keeps it running.
    68  //
    69  // This function returns after the process is successfully started.
    70  func (p *Daemon) Start() error {
    71  	p.lock.Lock()
    72  	defer p.lock.Unlock()
    73  
    74  	// A stopped proxy cannot be restarted
    75  	if p.stopped {
    76  		return fmt.Errorf("stopped")
    77  	}
    78  
    79  	// If we're already running, that is okay
    80  	if p.process != nil {
    81  		return nil
    82  	}
    83  
    84  	// Setup our stop channel
    85  	stopCh := make(chan struct{})
    86  	exitedCh := make(chan struct{})
    87  	p.stopCh = stopCh
    88  	p.exitedCh = exitedCh
    89  
    90  	// Start the loop.
    91  	go p.keepAlive(stopCh, exitedCh)
    92  
    93  	return nil
    94  }
    95  
    96  // keepAlive starts and keeps the configured process alive until it
    97  // is stopped via Stop.
    98  func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
    99  	defer close(exitedCh)
   100  
   101  	p.lock.Lock()
   102  	process := p.process
   103  	p.lock.Unlock()
   104  
   105  	// attemptsDeadline is the time at which we consider the daemon to have
   106  	// been alive long enough that we can reset the attempt counter.
   107  	//
   108  	// attempts keeps track of the number of restart attempts we've had and
   109  	// is used to calculate the wait time using an exponential backoff.
   110  	var attemptsDeadline time.Time
   111  	var attempts uint32
   112  
   113  	// Assume the process is adopted, we reset this when we start a new process
   114  	// ourselves below and use it to decide on a strategy for waiting.
   115  	adopted := true
   116  
   117  	for {
   118  		if process == nil {
   119  			// If we're passed the attempt deadline then reset the attempts
   120  			if !attemptsDeadline.IsZero() && time.Now().After(attemptsDeadline) {
   121  				attempts = 0
   122  			}
   123  			// Set ourselves a deadline - we have to make it at least this long before
   124  			// we come around the loop to consider it to have been a "successful"
   125  			// daemon startup and rest the counter above. Note that if the daemon
   126  			// fails before this, we reset the deadline to zero below so that backoff
   127  			// sleeps in the loop don't count as "success" time.
   128  			attemptsDeadline = time.Now().Add(DaemonRestartHealthy)
   129  			attempts++
   130  
   131  			// Calculate the exponential backoff and wait if we have to
   132  			if attempts > DaemonRestartBackoffMin {
   133  				exponent := (attempts - DaemonRestartBackoffMin)
   134  				if exponent > 31 {
   135  					exponent = 31
   136  				}
   137  				waitTime := (1 << exponent) * time.Second
   138  				if waitTime > DaemonRestartMaxWait {
   139  					waitTime = DaemonRestartMaxWait
   140  				}
   141  
   142  				if waitTime > 0 {
   143  					// If we are waiting, reset the success deadline so we don't
   144  					// accidentally interpret backoff sleep as successful runtime.
   145  					attemptsDeadline = time.Time{}
   146  
   147  					p.Logger.Printf(
   148  						"[WARN] agent/proxy: waiting %s before restarting daemon",
   149  						waitTime)
   150  
   151  					timer := time.NewTimer(waitTime)
   152  					select {
   153  					case <-timer.C:
   154  						// Timer is up, good!
   155  
   156  					case <-stopCh:
   157  						// During our backoff wait, we've been signaled to
   158  						// quit, so just quit.
   159  						timer.Stop()
   160  						return
   161  					}
   162  				}
   163  			}
   164  
   165  			p.lock.Lock()
   166  
   167  			// If we gracefully stopped then don't restart.
   168  			if p.stopped {
   169  				p.lock.Unlock()
   170  				return
   171  			}
   172  
   173  			// Process isn't started currently. We're restarting. Start it
   174  			// and save the process if we have it.
   175  			var err error
   176  			process, err = p.start()
   177  			if err == nil {
   178  				p.process = process
   179  				adopted = false
   180  			}
   181  			p.lock.Unlock()
   182  
   183  			if err != nil {
   184  				p.Logger.Printf("[ERR] agent/proxy: error restarting daemon: %s", err)
   185  				continue
   186  			}
   187  
   188  		}
   189  
   190  		var ps *os.ProcessState
   191  		var err error
   192  
   193  		if adopted {
   194  			// assign to err outside scope
   195  			_, err = findProcess(process.Pid)
   196  			if err == nil {
   197  				// Process appears to be running still, wait a bit before we poll again.
   198  				// We want a busy loop, but not too busy. 1 second between detecting a
   199  				// process death seems reasonable.
   200  				//
   201  				// SUBTLETY: we must NOT select on stopCh here since the Stop function
   202  				// assumes that as soon as this method returns and closes exitedCh, that
   203  				// the process is no longer running. If we are polling then we don't
   204  				// know that is true until we've polled again so we have to keep polling
   205  				// until the process goes away even if we know the Daemon is stopping.
   206  				time.Sleep(1 * time.Second)
   207  
   208  				// Restart the loop, process is still set so we effectively jump back to
   209  				// the findProcess call above.
   210  				continue
   211  			}
   212  		} else {
   213  			// Wait for child to exit
   214  			ps, err = process.Wait()
   215  		}
   216  
   217  		// Process exited somehow.
   218  		process = nil
   219  		if err != nil {
   220  			p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err)
   221  		} else if ps != nil && !ps.Exited() {
   222  			p.Logger.Printf("[INFO] agent/proxy: daemon left running")
   223  		} else if status, ok := exitStatus(ps); ok {
   224  			p.Logger.Printf("[INFO] agent/proxy: daemon exited with exit code: %d", status)
   225  		}
   226  	}
   227  }
   228  
   229  // start starts and returns the process. This will create a copy of the
   230  // configured *exec.Command with the modifications documented on Daemon
   231  // such as setting the proxy token environmental variable.
   232  func (p *Daemon) start() (*os.Process, error) {
   233  	cmd := *p.Command
   234  
   235  	// Add the proxy token to the environment. We first copy the env because it is
   236  	// a slice and therefore the "copy" above will only copy the slice reference.
   237  	// We allocate an exactly sized slice.
   238  	//
   239  	// Note that anything we add to the Env here is NOT persisted in the snapshot
   240  	// which only looks at p.Command.Env so it needs to be reconstructible exactly
   241  	// from data in the snapshot otherwise.
   242  	cmd.Env = make([]string, len(p.Command.Env), len(p.Command.Env)+2)
   243  	copy(cmd.Env, p.Command.Env)
   244  	cmd.Env = append(cmd.Env,
   245  		fmt.Sprintf("%s=%s", EnvProxyID, p.ProxyID),
   246  		fmt.Sprintf("%s=%s", EnvProxyToken, p.ProxyToken))
   247  
   248  	// Update the Daemon env
   249  
   250  	// Args must always contain a 0 entry which is usually the executed binary.
   251  	// To be safe and a bit more robust we default this, but only to prevent
   252  	// a panic below.
   253  	if len(cmd.Args) == 0 {
   254  		cmd.Args = []string{cmd.Path}
   255  	}
   256  
   257  	// Perform system-specific setup. In particular, Unix-like systems
   258  	// shuld set sid so that killing the agent doesn't kill the daemon.
   259  	configureDaemon(&cmd)
   260  
   261  	// Start it
   262  	p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:])
   263  	if err := cmd.Start(); err != nil {
   264  		return nil, err
   265  	}
   266  
   267  	// Write the pid file. This might error and that's okay.
   268  	if p.PidPath != "" {
   269  		pid := strconv.FormatInt(int64(cmd.Process.Pid), 10)
   270  		if err := file.WriteAtomic(p.PidPath, []byte(pid)); err != nil {
   271  			p.Logger.Printf(
   272  				"[DEBUG] agent/proxy: error writing pid file %q: %s",
   273  				p.PidPath, err)
   274  		}
   275  	}
   276  
   277  	return cmd.Process, nil
   278  }
   279  
   280  // Stop stops the daemon.
   281  //
   282  // This will attempt a graceful stop (SIGINT) before force killing the
   283  // process (SIGKILL). In either case, the process won't be automatically
   284  // restarted unless Start is called again.
   285  //
   286  // This is safe to call multiple times. If the daemon is already stopped,
   287  // then this returns no error.
   288  func (p *Daemon) Stop() error {
   289  	p.lock.Lock()
   290  
   291  	// If we're already stopped or never started, then no problem.
   292  	if p.stopped || p.process == nil {
   293  		// In the case we never even started, calling Stop makes it so
   294  		// that we can't ever start in the future, either, so mark this.
   295  		p.stopped = true
   296  		p.lock.Unlock()
   297  		return nil
   298  	}
   299  
   300  	// Note that we've stopped
   301  	p.stopped = true
   302  	close(p.stopCh)
   303  	process := p.process
   304  	p.lock.Unlock()
   305  
   306  	gracefulWait := p.gracefulWait
   307  	if gracefulWait == 0 {
   308  		gracefulWait = 5 * time.Second
   309  	}
   310  
   311  	// Defer removing the pid file. Even under error conditions we
   312  	// delete the pid file since Stop means that the manager is no
   313  	// longer managing this proxy and therefore nothing else will ever
   314  	// clean it up.
   315  	if p.PidPath != "" {
   316  		defer func() {
   317  			if err := os.Remove(p.PidPath); err != nil && !os.IsNotExist(err) {
   318  				p.Logger.Printf(
   319  					"[DEBUG] agent/proxy: error removing pid file %q: %s",
   320  					p.PidPath, err)
   321  			}
   322  		}()
   323  	}
   324  
   325  	// First, try a graceful stop
   326  	err := process.Signal(os.Interrupt)
   327  	if err == nil {
   328  		select {
   329  		case <-p.exitedCh:
   330  			// Success!
   331  			return nil
   332  
   333  		case <-time.After(gracefulWait):
   334  			// Interrupt didn't work
   335  			p.Logger.Printf("[DEBUG] agent/proxy: graceful wait of %s passed, "+
   336  				"killing", gracefulWait)
   337  		}
   338  	} else if isProcessAlreadyFinishedErr(err) {
   339  		// This can happen due to races between signals and polling.
   340  		return nil
   341  	} else {
   342  		p.Logger.Printf("[DEBUG] agent/proxy: sigint failed, killing: %s", err)
   343  	}
   344  
   345  	// Graceful didn't work (e.g. on windows where SIGINT isn't implemented),
   346  	// forcibly kill
   347  	err = process.Kill()
   348  	if err != nil && isProcessAlreadyFinishedErr(err) {
   349  		return nil
   350  	}
   351  	return err
   352  }
   353  
   354  // Close implements Proxy by stopping the run loop but not killing the process.
   355  // One Close is called, Stop has no effect.
   356  func (p *Daemon) Close() error {
   357  	p.lock.Lock()
   358  	defer p.lock.Unlock()
   359  
   360  	// If we're already stopped or never started, then no problem.
   361  	if p.stopped || p.process == nil {
   362  		p.stopped = true
   363  		return nil
   364  	}
   365  
   366  	// Note that we've stopped
   367  	p.stopped = true
   368  	close(p.stopCh)
   369  
   370  	return nil
   371  }
   372  
   373  // Equal implements Proxy to check for equality.
   374  func (p *Daemon) Equal(raw Proxy) bool {
   375  	p2, ok := raw.(*Daemon)
   376  	if !ok {
   377  		return false
   378  	}
   379  
   380  	// We compare equality on a subset of the command configuration
   381  	return p.ProxyToken == p2.ProxyToken &&
   382  		p.ProxyID == p2.ProxyID &&
   383  		p.Command.Path == p2.Command.Path &&
   384  		p.Command.Dir == p2.Command.Dir &&
   385  		reflect.DeepEqual(p.Command.Args, p2.Command.Args) &&
   386  		reflect.DeepEqual(p.Command.Env, p2.Command.Env)
   387  }
   388  
   389  // MarshalSnapshot implements Proxy
   390  func (p *Daemon) MarshalSnapshot() map[string]interface{} {
   391  	p.lock.Lock()
   392  	defer p.lock.Unlock()
   393  
   394  	// If we're stopped or have no process, then nothing to snapshot.
   395  	if p.stopped || p.process == nil {
   396  		return nil
   397  	}
   398  
   399  	return map[string]interface{}{
   400  		"Pid":         p.process.Pid,
   401  		"CommandPath": p.Command.Path,
   402  		"CommandArgs": p.Command.Args,
   403  		"CommandDir":  p.Command.Dir,
   404  		"CommandEnv":  p.Command.Env,
   405  		"ProxyToken":  p.ProxyToken,
   406  		"ProxyID":     p.ProxyID,
   407  	}
   408  }
   409  
   410  // UnmarshalSnapshot implements Proxy
   411  func (p *Daemon) UnmarshalSnapshot(m map[string]interface{}) error {
   412  	var s daemonSnapshot
   413  	if err := mapstructure.Decode(m, &s); err != nil {
   414  		return err
   415  	}
   416  
   417  	p.lock.Lock()
   418  	defer p.lock.Unlock()
   419  
   420  	// Set the basic fields
   421  	p.ProxyToken = s.ProxyToken
   422  	p.ProxyID = s.ProxyID
   423  	p.Command = &exec.Cmd{
   424  		Path: s.CommandPath,
   425  		Args: s.CommandArgs,
   426  		Dir:  s.CommandDir,
   427  		Env:  s.CommandEnv,
   428  	}
   429  
   430  	// FindProcess on many systems returns no error even if the process
   431  	// is now dead. We perform an extra check that the process is alive.
   432  	proc, err := findProcess(s.Pid)
   433  	if err != nil {
   434  		return err
   435  	}
   436  
   437  	// "Start it"
   438  	stopCh := make(chan struct{})
   439  	exitedCh := make(chan struct{})
   440  	p.stopCh = stopCh
   441  	p.exitedCh = exitedCh
   442  	p.process = proc
   443  	go p.keepAlive(stopCh, exitedCh)
   444  
   445  	return nil
   446  }
   447  
// daemonSnapshot is the structure of the marshaled data for snapshotting.
//
// Its field names match the map keys written by Daemon.MarshalSnapshot, and
// Daemon.UnmarshalSnapshot decodes that map back into this struct.
type daemonSnapshot struct {
	// Pid of the process. This is the value used to look the running
	// process up again on restore.
	Pid int

	// Command information, restored into Daemon.Command so the process can
	// be compared (Equal) and restarted.
	CommandPath string
	CommandArgs []string
	CommandDir  string
	CommandEnv  []string

	// NOTE(mitchellh): longer term there are discussions/plans to only
	// store the hash of the token but for now we need the full token in
	// case the process dies and has to be restarted.
	ProxyToken string

	// ProxyID is the ID of the proxy service, restored into Daemon.ProxyID.
	ProxyID string
}