github.com/outbrain/consul@v1.4.5/agent/proxyprocess/daemon.go (about) 1 package proxyprocess 2 3 import ( 4 "fmt" 5 "log" 6 "os" 7 "os/exec" 8 "reflect" 9 "strconv" 10 "sync" 11 "time" 12 13 "github.com/hashicorp/consul/lib/file" 14 "github.com/mitchellh/mapstructure" 15 ) 16 17 // Constants related to restart timers with the daemon mode proxies. At some 18 // point we will probably want to expose these knobs to an end user, but 19 // reasonable defaults are chosen. 20 const ( 21 DaemonRestartHealthy = 10 * time.Second // time before considering healthy 22 DaemonRestartBackoffMin = 3 // 3 attempts before backing off 23 DaemonRestartMaxWait = 1 * time.Minute // maximum backoff wait time 24 ) 25 26 // Daemon is a long-running proxy process. It is expected to keep running 27 // and to use blocking queries to detect changes in configuration, certs, 28 // and more. 29 // 30 // Consul will ensure that if the daemon crashes, that it is restarted. 31 type Daemon struct { 32 // Command is the command to execute to start this daemon. This must 33 // be a Cmd that isn't yet started. 34 Command *exec.Cmd 35 36 // ProxyID is the ID of the proxy service. This is required for API 37 // requests (along with the token) and is passed via env var. 38 ProxyID string 39 40 // ProxyToken is the special local-only ACL token that allows a proxy 41 // to communicate to the Connect-specific endpoints. 42 ProxyToken string 43 44 // Logger is where logs will be sent around the management of this 45 // daemon. The actual logs for the daemon itself will be sent to 46 // a file. 47 Logger *log.Logger 48 49 // PidPath is the path where a pid file will be created storing the 50 // pid of the active process. If this is empty then a pid-file won't 51 // be created. Under erroneous conditions, the pid file may not be 52 // created but the error will be logged to the Logger. 53 PidPath string 54 55 // For tests, they can set this to change the default duration to wait 56 // for a graceful quit. 57 gracefulWait time.Duration 58 59 // process is the started process 60 lock sync.Mutex 61 stopped bool 62 stopCh chan struct{} 63 exitedCh chan struct{} 64 process *os.Process 65 } 66 67 // Start starts the daemon and keeps it running. 68 // 69 // This function returns after the process is successfully started. 70 func (p *Daemon) Start() error { 71 p.lock.Lock() 72 defer p.lock.Unlock() 73 74 // A stopped proxy cannot be restarted 75 if p.stopped { 76 return fmt.Errorf("stopped") 77 } 78 79 // If we're already running, that is okay 80 if p.process != nil { 81 return nil 82 } 83 84 // Setup our stop channel 85 stopCh := make(chan struct{}) 86 exitedCh := make(chan struct{}) 87 p.stopCh = stopCh 88 p.exitedCh = exitedCh 89 90 // Start the loop. 91 go p.keepAlive(stopCh, exitedCh) 92 93 return nil 94 } 95 96 // keepAlive starts and keeps the configured process alive until it 97 // is stopped via Stop. 98 func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { 99 defer close(exitedCh) 100 101 p.lock.Lock() 102 process := p.process 103 p.lock.Unlock() 104 105 // attemptsDeadline is the time at which we consider the daemon to have 106 // been alive long enough that we can reset the attempt counter. 107 // 108 // attempts keeps track of the number of restart attempts we've had and 109 // is used to calculate the wait time using an exponential backoff. 110 var attemptsDeadline time.Time 111 var attempts uint32 112 113 // Assume the process is adopted, we reset this when we start a new process 114 // ourselves below and use it to decide on a strategy for waiting. 115 adopted := true 116 117 for { 118 if process == nil { 119 // If we're passed the attempt deadline then reset the attempts 120 if !attemptsDeadline.IsZero() && time.Now().After(attemptsDeadline) { 121 attempts = 0 122 } 123 // Set ourselves a deadline - we have to make it at least this long before 124 // we come around the loop to consider it to have been a "successful" 125 // daemon startup and rest the counter above. Note that if the daemon 126 // fails before this, we reset the deadline to zero below so that backoff 127 // sleeps in the loop don't count as "success" time. 128 attemptsDeadline = time.Now().Add(DaemonRestartHealthy) 129 attempts++ 130 131 // Calculate the exponential backoff and wait if we have to 132 if attempts > DaemonRestartBackoffMin { 133 exponent := (attempts - DaemonRestartBackoffMin) 134 if exponent > 31 { 135 exponent = 31 136 } 137 waitTime := (1 << exponent) * time.Second 138 if waitTime > DaemonRestartMaxWait { 139 waitTime = DaemonRestartMaxWait 140 } 141 142 if waitTime > 0 { 143 // If we are waiting, reset the success deadline so we don't 144 // accidentally interpret backoff sleep as successful runtime. 145 attemptsDeadline = time.Time{} 146 147 p.Logger.Printf( 148 "[WARN] agent/proxy: waiting %s before restarting daemon", 149 waitTime) 150 151 timer := time.NewTimer(waitTime) 152 select { 153 case <-timer.C: 154 // Timer is up, good! 155 156 case <-stopCh: 157 // During our backoff wait, we've been signaled to 158 // quit, so just quit. 159 timer.Stop() 160 return 161 } 162 } 163 } 164 165 p.lock.Lock() 166 167 // If we gracefully stopped then don't restart. 168 if p.stopped { 169 p.lock.Unlock() 170 return 171 } 172 173 // Process isn't started currently. We're restarting. Start it 174 // and save the process if we have it. 175 var err error 176 process, err = p.start() 177 if err == nil { 178 p.process = process 179 adopted = false 180 } 181 p.lock.Unlock() 182 183 if err != nil { 184 p.Logger.Printf("[ERR] agent/proxy: error restarting daemon: %s", err) 185 continue 186 } 187 188 } 189 190 var ps *os.ProcessState 191 var err error 192 193 if adopted { 194 // assign to err outside scope 195 _, err = findProcess(process.Pid) 196 if err == nil { 197 // Process appears to be running still, wait a bit before we poll again. 198 // We want a busy loop, but not too busy. 1 second between detecting a 199 // process death seems reasonable. 200 // 201 // SUBTLETY: we must NOT select on stopCh here since the Stop function 202 // assumes that as soon as this method returns and closes exitedCh, that 203 // the process is no longer running. If we are polling then we don't 204 // know that is true until we've polled again so we have to keep polling 205 // until the process goes away even if we know the Daemon is stopping. 206 time.Sleep(1 * time.Second) 207 208 // Restart the loop, process is still set so we effectively jump back to 209 // the findProcess call above. 210 continue 211 } 212 } else { 213 // Wait for child to exit 214 ps, err = process.Wait() 215 } 216 217 // Process exited somehow. 218 process = nil 219 if err != nil { 220 p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err) 221 } else if ps != nil && !ps.Exited() { 222 p.Logger.Printf("[INFO] agent/proxy: daemon left running") 223 } else if status, ok := exitStatus(ps); ok { 224 p.Logger.Printf("[INFO] agent/proxy: daemon exited with exit code: %d", status) 225 } 226 } 227 } 228 229 // start starts and returns the process. This will create a copy of the 230 // configured *exec.Command with the modifications documented on Daemon 231 // such as setting the proxy token environmental variable. 232 func (p *Daemon) start() (*os.Process, error) { 233 cmd := *p.Command 234 235 // Add the proxy token to the environment. We first copy the env because it is 236 // a slice and therefore the "copy" above will only copy the slice reference. 237 // We allocate an exactly sized slice. 238 // 239 // Note that anything we add to the Env here is NOT persisted in the snapshot 240 // which only looks at p.Command.Env so it needs to be reconstructible exactly 241 // from data in the snapshot otherwise. 242 cmd.Env = make([]string, len(p.Command.Env), len(p.Command.Env)+2) 243 copy(cmd.Env, p.Command.Env) 244 cmd.Env = append(cmd.Env, 245 fmt.Sprintf("%s=%s", EnvProxyID, p.ProxyID), 246 fmt.Sprintf("%s=%s", EnvProxyToken, p.ProxyToken)) 247 248 // Update the Daemon env 249 250 // Args must always contain a 0 entry which is usually the executed binary. 251 // To be safe and a bit more robust we default this, but only to prevent 252 // a panic below. 253 if len(cmd.Args) == 0 { 254 cmd.Args = []string{cmd.Path} 255 } 256 257 // Perform system-specific setup. In particular, Unix-like systems 258 // shuld set sid so that killing the agent doesn't kill the daemon. 259 configureDaemon(&cmd) 260 261 // Start it 262 p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:]) 263 if err := cmd.Start(); err != nil { 264 return nil, err 265 } 266 267 // Write the pid file. This might error and that's okay. 268 if p.PidPath != "" { 269 pid := strconv.FormatInt(int64(cmd.Process.Pid), 10) 270 if err := file.WriteAtomic(p.PidPath, []byte(pid)); err != nil { 271 p.Logger.Printf( 272 "[DEBUG] agent/proxy: error writing pid file %q: %s", 273 p.PidPath, err) 274 } 275 } 276 277 return cmd.Process, nil 278 } 279 280 // Stop stops the daemon. 281 // 282 // This will attempt a graceful stop (SIGINT) before force killing the 283 // process (SIGKILL). In either case, the process won't be automatically 284 // restarted unless Start is called again. 285 // 286 // This is safe to call multiple times. If the daemon is already stopped, 287 // then this returns no error. 288 func (p *Daemon) Stop() error { 289 p.lock.Lock() 290 291 // If we're already stopped or never started, then no problem. 292 if p.stopped || p.process == nil { 293 // In the case we never even started, calling Stop makes it so 294 // that we can't ever start in the future, either, so mark this. 295 p.stopped = true 296 p.lock.Unlock() 297 return nil 298 } 299 300 // Note that we've stopped 301 p.stopped = true 302 close(p.stopCh) 303 process := p.process 304 p.lock.Unlock() 305 306 gracefulWait := p.gracefulWait 307 if gracefulWait == 0 { 308 gracefulWait = 5 * time.Second 309 } 310 311 // Defer removing the pid file. Even under error conditions we 312 // delete the pid file since Stop means that the manager is no 313 // longer managing this proxy and therefore nothing else will ever 314 // clean it up. 315 if p.PidPath != "" { 316 defer func() { 317 if err := os.Remove(p.PidPath); err != nil && !os.IsNotExist(err) { 318 p.Logger.Printf( 319 "[DEBUG] agent/proxy: error removing pid file %q: %s", 320 p.PidPath, err) 321 } 322 }() 323 } 324 325 // First, try a graceful stop 326 err := process.Signal(os.Interrupt) 327 if err == nil { 328 select { 329 case <-p.exitedCh: 330 // Success! 331 return nil 332 333 case <-time.After(gracefulWait): 334 // Interrupt didn't work 335 p.Logger.Printf("[DEBUG] agent/proxy: graceful wait of %s passed, "+ 336 "killing", gracefulWait) 337 } 338 } else if isProcessAlreadyFinishedErr(err) { 339 // This can happen due to races between signals and polling. 340 return nil 341 } else { 342 p.Logger.Printf("[DEBUG] agent/proxy: sigint failed, killing: %s", err) 343 } 344 345 // Graceful didn't work (e.g. on windows where SIGINT isn't implemented), 346 // forcibly kill 347 err = process.Kill() 348 if err != nil && isProcessAlreadyFinishedErr(err) { 349 return nil 350 } 351 return err 352 } 353 354 // Close implements Proxy by stopping the run loop but not killing the process. 355 // One Close is called, Stop has no effect. 356 func (p *Daemon) Close() error { 357 p.lock.Lock() 358 defer p.lock.Unlock() 359 360 // If we're already stopped or never started, then no problem. 361 if p.stopped || p.process == nil { 362 p.stopped = true 363 return nil 364 } 365 366 // Note that we've stopped 367 p.stopped = true 368 close(p.stopCh) 369 370 return nil 371 } 372 373 // Equal implements Proxy to check for equality. 374 func (p *Daemon) Equal(raw Proxy) bool { 375 p2, ok := raw.(*Daemon) 376 if !ok { 377 return false 378 } 379 380 // We compare equality on a subset of the command configuration 381 return p.ProxyToken == p2.ProxyToken && 382 p.ProxyID == p2.ProxyID && 383 p.Command.Path == p2.Command.Path && 384 p.Command.Dir == p2.Command.Dir && 385 reflect.DeepEqual(p.Command.Args, p2.Command.Args) && 386 reflect.DeepEqual(p.Command.Env, p2.Command.Env) 387 } 388 389 // MarshalSnapshot implements Proxy 390 func (p *Daemon) MarshalSnapshot() map[string]interface{} { 391 p.lock.Lock() 392 defer p.lock.Unlock() 393 394 // If we're stopped or have no process, then nothing to snapshot. 395 if p.stopped || p.process == nil { 396 return nil 397 } 398 399 return map[string]interface{}{ 400 "Pid": p.process.Pid, 401 "CommandPath": p.Command.Path, 402 "CommandArgs": p.Command.Args, 403 "CommandDir": p.Command.Dir, 404 "CommandEnv": p.Command.Env, 405 "ProxyToken": p.ProxyToken, 406 "ProxyID": p.ProxyID, 407 } 408 } 409 410 // UnmarshalSnapshot implements Proxy 411 func (p *Daemon) UnmarshalSnapshot(m map[string]interface{}) error { 412 var s daemonSnapshot 413 if err := mapstructure.Decode(m, &s); err != nil { 414 return err 415 } 416 417 p.lock.Lock() 418 defer p.lock.Unlock() 419 420 // Set the basic fields 421 p.ProxyToken = s.ProxyToken 422 p.ProxyID = s.ProxyID 423 p.Command = &exec.Cmd{ 424 Path: s.CommandPath, 425 Args: s.CommandArgs, 426 Dir: s.CommandDir, 427 Env: s.CommandEnv, 428 } 429 430 // FindProcess on many systems returns no error even if the process 431 // is now dead. We perform an extra check that the process is alive. 432 proc, err := findProcess(s.Pid) 433 if err != nil { 434 return err 435 } 436 437 // "Start it" 438 stopCh := make(chan struct{}) 439 exitedCh := make(chan struct{}) 440 p.stopCh = stopCh 441 p.exitedCh = exitedCh 442 p.process = proc 443 go p.keepAlive(stopCh, exitedCh) 444 445 return nil 446 } 447 448 // daemonSnapshot is the structure of the marshaled data for snapshotting. 449 // 450 // Note we don't have to store the ProxyId because this is stored directly 451 // within the manager snapshot and is restored automatically. 452 type daemonSnapshot struct { 453 // Pid of the process. This is the only value actually required to 454 // regain management control. The remainder values are for Equal. 455 Pid int 456 457 // Command information 458 CommandPath string 459 CommandArgs []string 460 CommandDir string 461 CommandEnv []string 462 463 // NOTE(mitchellh): longer term there are discussions/plans to only 464 // store the hash of the token but for now we need the full token in 465 // case the process dies and has to be restarted. 466 ProxyToken string 467 468 ProxyID string 469 }