github.imxd.top/hashicorp/consul@v1.4.5/agent/proxyprocess/manager.go

package proxyprocess

import (
	"fmt"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/local"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/go-multierror"
)

const (
	// ManagerCoalescePeriod and ManagerQuiescentPeriod relate to how
	// notifications of updates from the local state are coalesced to
	// prevent lots of churn in the manager.
	//
	// When the local state updates, the manager will wait for quiescence.
	// For each update, the quiescence timer is reset. If the coalesce period
	// is reached, the manager will update proxies regardless of the frequent
	// changes. Then the whole cycle resets.
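	//
	// For example, with the defaults below: updates arriving at t=0ms,
	// t=200ms, and t=400ms are coalesced into a single sync at t=900ms
	// (500ms of quiescence after the last update), while a continuous
	// stream of updates still forces a sync every 5 seconds.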
	ManagerCoalescePeriod  = 5 * time.Second
	ManagerQuiescentPeriod = 500 * time.Millisecond

	// ManagerSnapshotPeriod is the interval that snapshots are taken.
	// The last snapshot state is preserved and, if it matches, a file isn't
	// written, so it's safe for this to be reasonably frequent.
	ManagerSnapshotPeriod = 1 * time.Second
)

// Manager starts, stops, snapshots, and restores managed proxies.
//
// The manager will not start or stop any processes until Run is called.
// Prior to this, any configuration, snapshot loading, etc. can be done.
// Even if a process is no longer running after loading the snapshot, it
// will not be restarted until Run is called.
//
// The Manager works by subscribing to change notifications on a local.State
// structure. Whenever a change is detected, the Manager syncs its internal
// state with the local.State and starts/stops any necessary proxies. The
// manager never holds a lock on local.State (except to read the proxies)
// and state updates may occur while the Manager is syncing. This is okay,
// since a change notification will be queued to trigger another sync.
//
// The change notifications from the local state are coalesced (see
// ManagerCoalescePeriod) so that frequent changes within the local state
// do not trigger dozens of proxy resyncs.
type Manager struct {
	// State is the local state that is the source of truth for all
	// configured managed proxies.
	State *local.State

	// Logger is the logger for information about manager behavior.
	// Output for proxies will not go here generally but varies by proxy
	// implementation type.
	Logger *log.Logger

	// DataDir is the path to the directory where data for proxies is
	// written, including snapshots for any state changes in the manager.
	// Within the data dir, files will be written in the following locations:
	//
	//   * logs/ - log files named <service id>-std{out|err}.log
	//   * pids/ - pid files for daemons named <service id>.pid
	//   * snapshot.json - the state of the manager
	//
	DataDir string

	// ProxyEnv is extra environment variables to set for the proxies.
	ProxyEnv []string

	// SnapshotPeriod is the duration between snapshots. This can be set
	// relatively low to ensure accuracy, because if the new snapshot matches
	// the last snapshot taken, no file will be written. Therefore, setting
	// this low causes only slight CPU/memory usage but doesn't result in
	// disk IO. If this isn't set, ManagerSnapshotPeriod will be the default.
	//
	// This only has an effect if snapshots are enabled (DataDir is set).
	SnapshotPeriod time.Duration

	// CoalescePeriod and QuiescentPeriod control the timers for coalescing
	// updates from the local state. See the defaults at the top of this
	// file for more documentation. These will be set to those defaults
	// by NewManager.
	CoalescePeriod  time.Duration
	QuiescentPeriod time.Duration

	// AllowRoot configures whether proxies can be executed as root (EUID == 0).
	// If this is false then the manager will run and proxies can be added
	// and removed, but none will be started and errors will be logged
	// to the logger.
	AllowRoot bool

	// lock is held while reading/writing any internal state of the manager.
	// cond is a condition variable on lock that is broadcast for runState
	// changes.
	lock *sync.Mutex
	cond *sync.Cond

	// runState is the current state of the manager. To read this the
	// lock must be held. The condition variable cond can be waited on
	// for changes to this value.
	runState managerRunState

	// lastSnapshot stores a pointer to the last snapshot that successfully
	// wrote to disk. This is used for dup detection to prevent rewriting
	// the same snapshot multiple times. Snapshots should never be that
	// large, so keeping it in-memory should be cheap even for thousands of
	// proxies (an unlikely scenario).
	lastSnapshot *snapshot

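	// proxies is the current set of managed Proxy instances, keyed by
	// proxy service ID. The lock must be held to read or write this map.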
	proxies map[string]Proxy
}

// NewManager initializes a Manager. After initialization, the exported
// fields should be configured as desired. To start the Manager, execute
// Run in a goroutine.
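//
// A minimal usage sketch (the State and DataDir values here are
// illustrative placeholders, not defaults):
//
//	m := NewManager()
//	m.State = agentState // the agent's *local.State
//	m.DataDir = dataDir  // enables snapshots, pid files, and log files
//	go m.Run()
//	// ...
//	defer m.Close()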
func NewManager() *Manager {
	var lock sync.Mutex
	return &Manager{
		Logger:          defaultLogger,
		SnapshotPeriod:  ManagerSnapshotPeriod,
		CoalescePeriod:  ManagerCoalescePeriod,
		QuiescentPeriod: ManagerQuiescentPeriod,
		lock:            &lock,
		cond:            sync.NewCond(&lock),
		proxies:         make(map[string]Proxy),
	}
}

// defaultLogger is the default logger for NewManager so that it is never nil.
var defaultLogger = log.New(os.Stderr, "", log.LstdFlags)

// managerRunState is the state of the Manager.
//
// This is a basic state machine with the following transitions:
//
//   * idle     => running, stopped
//   * running  => stopping, stopped
//   * stopping => stopped
//   * stopped  => <>
//
type managerRunState uint8

const (
	managerStateIdle managerRunState = iota
	managerStateRunning
	managerStateStopping
	managerStateStopped
)

// Close stops the manager. Managed processes are NOT stopped.
func (m *Manager) Close() error {
	m.lock.Lock()
	defer m.lock.Unlock()

	return m.stop(func(p Proxy) error {
		return p.Close()
	})
}

// Kill will Close the manager and Kill all proxies that were being managed.
// Only ONE of Kill or Close must be called. If Close has been called already
// then this will have no effect.
func (m *Manager) Kill() error {
	m.lock.Lock()
	defer m.lock.Unlock()

	return m.stop(func(p Proxy) error {
		return p.Stop()
	})
}

// stop stops the run loop and cleans up all the proxies by calling
// the given cleaner. If the cleaner returns an error the proxy won't be
// removed from the map.
//
// The lock must be held while this is called.
func (m *Manager) stop(cleaner func(Proxy) error) error {
	for {
		// Special case state that exits the for loop
		if m.runState == managerStateStopped {
			break
		}

		switch m.runState {
		case managerStateIdle:
			// Idle so just set it to stopped and return. We notify
			// the condition variable in case others are waiting.
			m.runState = managerStateStopped
			m.cond.Broadcast()
			return nil

		case managerStateRunning:
			// Set the state to stopping and broadcast to all waiters,
			// since Run is sitting on cond.Wait.
			m.runState = managerStateStopping
			m.cond.Broadcast()
			m.cond.Wait() // Wait on the stopping event

		case managerStateStopping:
			// Still stopping, wait...
			m.cond.Wait()
		}
	}

	// Clean up all the proxies. Use a separate variable for the cleaner
	// error so it doesn't shadow the accumulated multierror.
	var err error
	for id, proxy := range m.proxies {
		if perr := cleaner(proxy); perr != nil {
			err = multierror.Append(
				err, fmt.Errorf("failed to stop proxy %q: %s", id, perr))
			continue
		}

		// Remove it since it is already stopped successfully
		delete(m.proxies, id)
	}

	return err
}

// Run syncs with the local state and supervises existing proxies.
//
// This blocks and should be run in a goroutine. If another Run is already
// executing, this will do nothing and return.
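//
// Run returns shortly after Close or Kill is called.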
func (m *Manager) Run() {
	m.lock.Lock()
	if m.runState != managerStateIdle {
		m.lock.Unlock()
		return
	}

	// Set the state to running
	m.runState = managerStateRunning
	m.lock.Unlock()

	// Start a goroutine that just waits for a stop request
	stopCh := make(chan struct{})
	go func() {
		defer close(stopCh)
		m.lock.Lock()
		defer m.lock.Unlock()

		// We wait for anything not running, just so we're more resilient
		// in the face of state machine issues. Basically any state change
		// will cause us to quit.
		for m.runState == managerStateRunning {
			m.cond.Wait()
		}
	}()

	// When we exit, we set the state to stopped and broadcast to any
	// waiting Close functions that they can return.
	defer func() {
		m.lock.Lock()
		m.runState = managerStateStopped
		m.cond.Broadcast()
		m.lock.Unlock()
	}()

	// Register for proxy catalog change notifications
	notifyCh := make(chan struct{}, 1)
	m.State.NotifyProxy(notifyCh)
	defer m.State.StopNotifyProxy(notifyCh)

	// Start the timer for snapshots. We don't use a ticker because disk
	// IO can be slow and we don't want overlapping notifications. So we only
	// reset the timer once the snapshot is complete rather than continuously.
	snapshotTimer := time.NewTimer(m.SnapshotPeriod)
	defer snapshotTimer.Stop()

	m.Logger.Println("[DEBUG] agent/proxy: managed Connect proxy manager started")
SYNC:
	for {
		// Sync first, before waiting on further notifications so that
		// we can start with a known-current state.
		m.sync()

		// Note: these use time.After rather than reusable time.Timers
		// because both periods are relatively short anyway, so the
		// throwaway timers become eligible for GC very quickly and the
		// overhead is not a concern.
		var quiescent, quantum <-chan time.Time

		// Start a loop waiting for events from the local state store. This
		// loops rather than just `select` so we can coalesce many state
		// updates over a period of time.
		for {
			select {
			case <-notifyCh:
				// If this is our first notification since the last sync,
				// reset the quantum timer which is the max time we'll wait.
				if quantum == nil {
					quantum = time.After(m.CoalescePeriod)
				}

				// Always reset the quiescent timer
				quiescent = time.After(m.QuiescentPeriod)

			case <-quantum:
				continue SYNC

			case <-quiescent:
				continue SYNC

			case <-snapshotTimer.C:
				// Perform a snapshot
				if path := m.SnapshotPath(); path != "" {
					if err := m.snapshot(path, true); err != nil {
						m.Logger.Printf("[WARN] agent/proxy: failed to snapshot state: %s", err)
					}
				}

				// Reset
				snapshotTimer.Reset(m.SnapshotPeriod)

			case <-stopCh:
				// Stop immediately, no cleanup
				m.Logger.Println("[DEBUG] agent/proxy: Stopping managed Connect proxy manager")
				return
			}
		}
	}
}

// sync syncs data with the local state store to update the current manager
// state and start/stop necessary proxies.
func (m *Manager) sync() {
	m.lock.Lock()
	defer m.lock.Unlock()

	// If we don't allow root and we're root, then log a high-severity message.
	if !m.AllowRoot && isRoot() {
		m.Logger.Println("[WARN] agent/proxy: running as root, will not start managed proxies")
		return
	}

	// Get the current set of proxies
	state := m.State.Proxies()

	// Go through our existing proxies that we're currently managing to
	// determine if they're still in the state or not. If they're in the
	// state, we need to diff to determine if we're starting a new proxy.
	// If they're not in the state, then we need to stop the proxy since it
	// is now orphaned.
	for id, proxy := range m.proxies {
		// Get the proxy.
		stateProxy, ok := state[id]
		if ok {
			// Remove the proxy from the state so we don't start it as a
			// new proxy below.
			delete(state, id)

			// Make the proxy so we can compare. This does not start it.
			proxy2, err := m.newProxy(stateProxy)
			if err != nil {
				m.Logger.Printf("[ERROR] agent/proxy: failed to initialize proxy for %q: %s", id, err)
				continue
			}

			// If the proxies are equal, then do nothing
			if proxy.Equal(proxy2) {
				continue
			}

			// Proxies are not equal, so we should stop it. We add it
			// back to the state here (unlikely case) so the loop below starts
			// the new one.
			state[id] = stateProxy

			// Fall out of the `if` as if the proxy didn't exist so we stop it below
		}

		// Proxy is deregistered. Remove it from our map and stop it
		delete(m.proxies, id)
		if err := proxy.Stop(); err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to stop deregistered proxy for %q: %s", id, err)
		}
	}

	// Remaining entries in state are new proxies. Start them!
	for id, stateProxy := range state {
		proxy, err := m.newProxy(stateProxy)
		if err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to initialize proxy for %q: %s", id, err)
			continue
		}

		if err := proxy.Start(); err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to start proxy for %q: %s", id, err)
			continue
		}

		m.proxies[id] = proxy
	}
}

// newProxy creates the proper Proxy implementation for the configured
// local managed proxy.
func (m *Manager) newProxy(mp *local.ManagedProxy) (Proxy, error) {
	// Defensive because the alternative is to panic which is not desired
	if mp == nil || mp.Proxy == nil {
		return nil, fmt.Errorf("internal error: nil *local.ManagedProxy or Proxy field")
	}
	p := mp.Proxy

	// We reuse the service ID a few times
	id := p.ProxyService.ID

	// Create the Proxy. We could just as easily switch on p.ExecMode
	// here, but we want a single location where ExecMode maps to a Proxy
	// implementation, since that lowers the chance of getting it wrong.
	proxy, err := m.newProxyFromMode(p.ExecMode, id)
	if err != nil {
		return nil, err
	}

	// Depending on the proxy type we configure the rest from our ManagedProxy
	switch proxy := proxy.(type) {
	case *Daemon:
		command := p.Command

		// This should never happen since validation should happen upstream
		// but verify it because the alternative is to panic below.
		if len(command) == 0 {
			return nil, fmt.Errorf("daemon mode managed proxy requires command")
		}

		// Build the command to execute.
		var cmd exec.Cmd
		cmd.Path = command[0]
		cmd.Args = command // Args[0] repeats the path; exec.Cmd expects the command name there
		if err := m.configureLogDir(id, &cmd); err != nil {
			return nil, fmt.Errorf("error configuring proxy logs: %s", err)
		}

		// Pass in the environment variables for the proxy process
		cmd.Env = append(m.ProxyEnv, os.Environ()...)

		// Build the daemon structure
		proxy.Command = &cmd
		proxy.ProxyID = id
		proxy.ProxyToken = mp.ProxyToken
		return proxy, nil

	default:
		return nil, fmt.Errorf("unsupported managed proxy type: %q", p.ExecMode)
	}
}

// newProxyFromMode just initializes the proxy structure from only the mode
// and the service ID. This is a shared method between newProxy and Restore
// so that we only have one location where we turn ExecMode into a Proxy.
func (m *Manager) newProxyFromMode(mode structs.ProxyExecMode, id string) (Proxy, error) {
	switch mode {
	case structs.ProxyExecModeDaemon:
		return &Daemon{
			Logger:  m.Logger,
			PidPath: pidPath(filepath.Join(m.DataDir, "pids"), id),
		}, nil

	default:
		return nil, fmt.Errorf("unsupported managed proxy type: %q", mode)
	}
}

// configureLogDir sets up the command's stdout/stderr file descriptors so
// that the proxy's output is logged to the proper file paths for the given
// service ID.
func (m *Manager) configureLogDir(id string, cmd *exec.Cmd) error {
	// Create the log directory
	logDir := ""
	if m.DataDir != "" {
		logDir = filepath.Join(m.DataDir, "logs")
		if err := os.MkdirAll(logDir, 0700); err != nil {
			return err
		}
	}

	// Configure the stdout, stderr paths
	stdoutPath := logPath(logDir, id, "stdout")
	stderrPath := logPath(logDir, id, "stderr")

	// Open the files. We want to append to each. We expect these files
	// to be rotated by some external process.
	stdoutF, err := os.OpenFile(stdoutPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
	if err != nil {
		return fmt.Errorf("error creating stdout file: %s", err)
	}
	stderrF, err := os.OpenFile(stderrPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
	if err != nil {
		// Don't forget to close stdoutF which successfully opened
		stdoutF.Close()

		return fmt.Errorf("error creating stderr file: %s", err)
	}

	cmd.Stdout = stdoutF
	cmd.Stderr = stderrF
	return nil
}

// logPath is a helper to return the path to the log file for the given
// directory, service ID, and stream type (stdout or stderr).
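//
// For example, logPath("/data/logs", "web", "stdout") returns
// "/data/logs/web-stdout.log".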
func logPath(dir, id, stream string) string {
	return filepath.Join(dir, fmt.Sprintf("%s-%s.log", id, stream))
}

// pidPath is a helper to return the path to the pid file for the given
// directory and service ID.
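// For example, pidPath("/data/pids", "web") returns "/data/pids/web.pid";
// an empty dir returns "".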
func pidPath(dir, id string) string {
	// If no directory is given we do not write a pid
	if dir == "" {
		return ""
	}

	return filepath.Join(dir, fmt.Sprintf("%s.pid", id))
}