github.com/whamcloud/lemur@v0.0.0-20190827193804-4655df8a52af/cmd/lhsmd/agent/plugin.go (about) 1 // Copyright (c) 2018 DDN. All rights reserved. 2 // Use of this source code is governed by a MIT-style 3 // license that can be found in the LICENSE file. 4 5 package agent 6 7 import ( 8 "bytes" 9 "encoding/json" 10 "os" 11 "os/exec" 12 "path" 13 "time" 14 15 "github.com/pkg/errors" 16 17 "golang.org/x/net/context" 18 19 "github.com/intel-hpdd/lemur/cmd/lhsmd/config" 20 "github.com/intel-hpdd/logging/alert" 21 "github.com/intel-hpdd/logging/audit" 22 "github.com/intel-hpdd/logging/debug" 23 ) 24 25 var backoff = []time.Duration{ 26 0 * time.Second, 27 1 * time.Second, 28 10 * time.Second, 29 30 * time.Second, 30 1 * time.Minute, 31 } 32 var maxBackoff = len(backoff) - 1 33 34 type ( 35 // PluginConfig represents configuration for a single plugin 36 PluginConfig struct { 37 Name string 38 BinPath string 39 AgentConnection string 40 ClientMount string 41 Args []string 42 RestartOnFailure bool 43 44 lastRestart time.Time 45 restartCount int 46 } 47 48 // PluginMonitor watches monitored plugins and restarts 49 // them as needed. 50 PluginMonitor struct { 51 processChan ppChan 52 processStateChan psChan 53 } 54 55 pluginProcess struct { 56 plugin *PluginConfig 57 cmd *exec.Cmd 58 } 59 60 pluginStatus struct { 61 ps *os.ProcessState 62 err error 63 } 64 65 ppChan chan *pluginProcess 66 psChan chan *pluginStatus 67 ) 68 69 func (p *PluginConfig) String() string { 70 data, err := json.Marshal(p) 71 if err != nil { 72 alert.Abort(errors.Wrap(err, "marshal failed")) 73 } 74 75 var out bytes.Buffer 76 json.Indent(&out, data, "", "\t") 77 return out.String() 78 } 79 80 // NoRestart optionally sets a plugin to not be restarted on failure 81 func (p *PluginConfig) NoRestart() *PluginConfig { 82 p.RestartOnFailure = false 83 return p 84 } 85 86 // RestartDelay returns a time.Duration to delay restarts based on 87 // the number of restarts and the last restart time. 88 func (p *PluginConfig) RestartDelay() time.Duration { 89 // If it's been a decent amount of time since the last restart, 90 // reset the backoff mechanism for a quick restart. 91 if time.Since(p.lastRestart) > backoff[maxBackoff]*2 { 92 p.restartCount = 0 93 } 94 95 if p.restartCount > maxBackoff { 96 return backoff[maxBackoff] 97 } 98 return backoff[p.restartCount] 99 } 100 101 // NewPlugin returns a plugin configuration 102 func NewPlugin(name, binPath, conn, mountRoot string, args ...string) *PluginConfig { 103 return &PluginConfig{ 104 Name: name, 105 BinPath: binPath, 106 AgentConnection: conn, 107 ClientMount: path.Join(mountRoot, name), 108 Args: args, 109 RestartOnFailure: true, 110 } 111 } 112 113 // NewMonitor creates a new plugin monitor 114 func NewMonitor() *PluginMonitor { 115 return &PluginMonitor{ 116 processChan: make(ppChan), 117 processStateChan: make(psChan), 118 } 119 } 120 121 func (m *PluginMonitor) run(ctx context.Context) { 122 processMap := make(map[int]*PluginConfig) 123 124 var waitForCmd = func(cmd *exec.Cmd) { 125 debug.Printf("Waiting for %s (%d) to exit", cmd.Path, cmd.Process.Pid) 126 ps, err := cmd.Process.Wait() 127 if err != nil { 128 audit.Logf("Err after Wait() for %d: %s", cmd.Process.Pid, err) 129 } 130 131 debug.Printf("PID %d finished: %s", cmd.Process.Pid, ps) 132 m.processStateChan <- &pluginStatus{ps, err} 133 } 134 135 for { 136 select { 137 case p := <-m.processChan: 138 processMap[p.cmd.Process.Pid] = p.plugin 139 go waitForCmd(p.cmd) 140 case s := <-m.processStateChan: 141 cfg, found := processMap[s.ps.Pid()] 142 if !found { 143 debug.Printf("Received disp of unknown pid: %d", s.ps.Pid()) 144 break 145 } 146 147 delete(processMap, s.ps.Pid()) 148 audit.Logf("Process %d for %s died: %s", s.ps.Pid(), cfg.Name, s.ps) 149 if cfg.RestartOnFailure { 150 delay := cfg.RestartDelay() 151 audit.Logf("Restarting plugin %s after delay of %s (attempt %d)", cfg.Name, delay, cfg.restartCount) 152 153 cfg.restartCount++ 154 cfg.lastRestart = time.Now() 155 // Restart in a different goroutine to 156 // avoid deadlocking this one. 157 go func(cfg *PluginConfig, delay time.Duration) { 158 <-time.After(delay) 159 160 err := m.StartPlugin(cfg) 161 if err != nil { 162 audit.Logf("Failed to restart plugin %s: %s", cfg.Name, err) 163 } 164 }(cfg, delay) 165 } 166 case <-ctx.Done(): 167 return 168 } 169 } 170 } 171 172 // Start creates a new plugin monitor 173 func (m *PluginMonitor) Start(ctx context.Context) { 174 go m.run(ctx) 175 } 176 177 // StartPlugin starts the plugin and monitors it 178 func (m *PluginMonitor) StartPlugin(cfg *PluginConfig) error { 179 debug.Printf("Starting %s for %s", cfg.BinPath, cfg.Name) 180 181 cmd := exec.Command(cfg.BinPath, cfg.Args...) // #nosec 182 183 prefix := path.Base(cfg.BinPath) 184 cmd.Stdout = audit.Writer().Prefix(prefix + " ") 185 cmd.Stderr = audit.Writer().Prefix(prefix + "-stderr ") 186 187 cmd.Env = append(os.Environ(), config.AgentConnEnvVar+"="+cfg.AgentConnection) 188 cmd.Env = append(cmd.Env, config.PluginMountpointEnvVar+"="+cfg.ClientMount) 189 190 if err := cmd.Start(); err != nil { 191 return errors.Wrapf(err, "cmd failed %q", cmd) 192 } 193 194 audit.Logf("Started %s (PID: %d)", cmd.Path, cmd.Process.Pid) 195 m.processChan <- &pluginProcess{cfg, cmd} 196 197 return nil 198 }