github.com/whamcloud/lemur@v0.0.0-20190827193804-4655df8a52af/cmd/lhsmd/agent/plugin.go (about)

     1  // Copyright (c) 2018 DDN. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package agent
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/json"
    10  	"os"
    11  	"os/exec"
    12  	"path"
    13  	"time"
    14  
    15  	"github.com/pkg/errors"
    16  
    17  	"golang.org/x/net/context"
    18  
    19  	"github.com/intel-hpdd/lemur/cmd/lhsmd/config"
    20  	"github.com/intel-hpdd/logging/alert"
    21  	"github.com/intel-hpdd/logging/audit"
    22  	"github.com/intel-hpdd/logging/debug"
    23  )
    24  
    25  var backoff = []time.Duration{
    26  	0 * time.Second,
    27  	1 * time.Second,
    28  	10 * time.Second,
    29  	30 * time.Second,
    30  	1 * time.Minute,
    31  }
    32  var maxBackoff = len(backoff) - 1
    33  
    34  type (
    35  	// PluginConfig represents configuration for a single plugin
    36  	PluginConfig struct {
    37  		Name             string
    38  		BinPath          string
    39  		AgentConnection  string
    40  		ClientMount      string
    41  		Args             []string
    42  		RestartOnFailure bool
    43  
    44  		lastRestart  time.Time
    45  		restartCount int
    46  	}
    47  
    48  	// PluginMonitor watches monitored plugins and restarts
    49  	// them as needed.
    50  	PluginMonitor struct {
    51  		processChan      ppChan
    52  		processStateChan psChan
    53  	}
    54  
    55  	pluginProcess struct {
    56  		plugin *PluginConfig
    57  		cmd    *exec.Cmd
    58  	}
    59  
    60  	pluginStatus struct {
    61  		ps  *os.ProcessState
    62  		err error
    63  	}
    64  
    65  	ppChan chan *pluginProcess
    66  	psChan chan *pluginStatus
    67  )
    68  
    69  func (p *PluginConfig) String() string {
    70  	data, err := json.Marshal(p)
    71  	if err != nil {
    72  		alert.Abort(errors.Wrap(err, "marshal failed"))
    73  	}
    74  
    75  	var out bytes.Buffer
    76  	json.Indent(&out, data, "", "\t")
    77  	return out.String()
    78  }
    79  
    80  // NoRestart optionally sets a plugin to not be restarted on failure
    81  func (p *PluginConfig) NoRestart() *PluginConfig {
    82  	p.RestartOnFailure = false
    83  	return p
    84  }
    85  
    86  // RestartDelay returns a time.Duration to delay restarts based on
    87  // the number of restarts and the last restart time.
    88  func (p *PluginConfig) RestartDelay() time.Duration {
    89  	// If it's been a decent amount of time since the last restart,
    90  	// reset the backoff mechanism for a quick restart.
    91  	if time.Since(p.lastRestart) > backoff[maxBackoff]*2 {
    92  		p.restartCount = 0
    93  	}
    94  
    95  	if p.restartCount > maxBackoff {
    96  		return backoff[maxBackoff]
    97  	}
    98  	return backoff[p.restartCount]
    99  }
   100  
   101  // NewPlugin returns a plugin configuration
   102  func NewPlugin(name, binPath, conn, mountRoot string, args ...string) *PluginConfig {
   103  	return &PluginConfig{
   104  		Name:             name,
   105  		BinPath:          binPath,
   106  		AgentConnection:  conn,
   107  		ClientMount:      path.Join(mountRoot, name),
   108  		Args:             args,
   109  		RestartOnFailure: true,
   110  	}
   111  }
   112  
   113  // NewMonitor creates a new plugin monitor
   114  func NewMonitor() *PluginMonitor {
   115  	return &PluginMonitor{
   116  		processChan:      make(ppChan),
   117  		processStateChan: make(psChan),
   118  	}
   119  }
   120  
   121  func (m *PluginMonitor) run(ctx context.Context) {
   122  	processMap := make(map[int]*PluginConfig)
   123  
   124  	var waitForCmd = func(cmd *exec.Cmd) {
   125  		debug.Printf("Waiting for %s (%d) to exit", cmd.Path, cmd.Process.Pid)
   126  		ps, err := cmd.Process.Wait()
   127  		if err != nil {
   128  			audit.Logf("Err after Wait() for %d: %s", cmd.Process.Pid, err)
   129  		}
   130  
   131  		debug.Printf("PID %d finished: %s", cmd.Process.Pid, ps)
   132  		m.processStateChan <- &pluginStatus{ps, err}
   133  	}
   134  
   135  	for {
   136  		select {
   137  		case p := <-m.processChan:
   138  			processMap[p.cmd.Process.Pid] = p.plugin
   139  			go waitForCmd(p.cmd)
   140  		case s := <-m.processStateChan:
   141  			cfg, found := processMap[s.ps.Pid()]
   142  			if !found {
   143  				debug.Printf("Received disp of unknown pid: %d", s.ps.Pid())
   144  				break
   145  			}
   146  
   147  			delete(processMap, s.ps.Pid())
   148  			audit.Logf("Process %d for %s died: %s", s.ps.Pid(), cfg.Name, s.ps)
   149  			if cfg.RestartOnFailure {
   150  				delay := cfg.RestartDelay()
   151  				audit.Logf("Restarting plugin %s after delay of %s (attempt %d)", cfg.Name, delay, cfg.restartCount)
   152  
   153  				cfg.restartCount++
   154  				cfg.lastRestart = time.Now()
   155  				// Restart in a different goroutine to
   156  				// avoid deadlocking this one.
   157  				go func(cfg *PluginConfig, delay time.Duration) {
   158  					<-time.After(delay)
   159  
   160  					err := m.StartPlugin(cfg)
   161  					if err != nil {
   162  						audit.Logf("Failed to restart plugin %s: %s", cfg.Name, err)
   163  					}
   164  				}(cfg, delay)
   165  			}
   166  		case <-ctx.Done():
   167  			return
   168  		}
   169  	}
   170  }
   171  
   172  // Start creates a new plugin monitor
   173  func (m *PluginMonitor) Start(ctx context.Context) {
   174  	go m.run(ctx)
   175  }
   176  
   177  // StartPlugin starts the plugin and monitors it
   178  func (m *PluginMonitor) StartPlugin(cfg *PluginConfig) error {
   179  	debug.Printf("Starting %s for %s", cfg.BinPath, cfg.Name)
   180  
   181  	cmd := exec.Command(cfg.BinPath, cfg.Args...) // #nosec
   182  
   183  	prefix := path.Base(cfg.BinPath)
   184  	cmd.Stdout = audit.Writer().Prefix(prefix + " ")
   185  	cmd.Stderr = audit.Writer().Prefix(prefix + "-stderr ")
   186  
   187  	cmd.Env = append(os.Environ(), config.AgentConnEnvVar+"="+cfg.AgentConnection)
   188  	cmd.Env = append(cmd.Env, config.PluginMountpointEnvVar+"="+cfg.ClientMount)
   189  
   190  	if err := cmd.Start(); err != nil {
   191  		return errors.Wrapf(err, "cmd failed %q", cmd)
   192  	}
   193  
   194  	audit.Logf("Started %s (PID: %d)", cmd.Path, cmd.Process.Pid)
   195  	m.processChan <- &pluginProcess{cfg, cmd}
   196  
   197  	return nil
   198  }