github.imxd.top/hashicorp/consul@v1.4.5/agent/proxyprocess/manager.go

package proxyprocess

import (
	"fmt"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/local"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/go-multierror"
)

const (
	// ManagerCoalescePeriod and ManagerQuiescentPeriod relate to how
	// notifications of updates from the local state are coalesced to prevent
	// lots of churn in the manager.
	//
	// When the local state updates, the manager will wait for quiescence.
	// For each update, the quiescence timer is reset. If the coalesce period
	// is reached, the manager will update proxies regardless of the frequent
	// changes. Then the whole cycle resets.
	ManagerCoalescePeriod  = 5 * time.Second
	ManagerQuiescentPeriod = 500 * time.Millisecond

	// ManagerSnapshotPeriod is the interval at which snapshots are taken.
	// The last snapshot state is preserved and, if the new snapshot matches
	// it, no file is written, so it's safe for this to be reasonably frequent.
	ManagerSnapshotPeriod = 1 * time.Second
)

// Manager starts, stops, snapshots, and restores managed proxies.
//
// The manager will not start or stop any processes until Run is called.
// Prior to this, any configuration, snapshot loading, etc. can be done.
// Even if a process is no longer running after loading the snapshot, it
// will not be restarted until Run is called.
//
// The Manager works by subscribing to change notifications on a local.State
// structure. Whenever a change is detected, the Manager syncs its internal
// state with the local.State and starts/stops any necessary proxies. The
// manager never holds a lock on local.State (except to read the proxies)
// and state updates may occur while the Manager is syncing. This is okay,
// since a change notification will be queued to trigger another sync.
//
// The change notifications from the local state are coalesced (see
// ManagerCoalescePeriod) so that frequent changes within the local state
// do not trigger dozens of proxy resyncs.
type Manager struct {
	// State is the local state that is the source of truth for all
	// configured managed proxies.
	State *local.State

	// Logger is the logger for information about manager behavior.
	// Output for proxies will not go here generally but varies by proxy
	// implementation type.
	Logger *log.Logger

	// DataDir is the path to the directory where data for proxies is
	// written, including snapshots for any state changes in the manager.
	// Within the data dir, files will be written in the following locations:
	//
	//   * logs/ - log files named <service id>-std{out|err}.log
	//   * pids/ - pid files for daemons named <service id>.pid
	//   * snapshot.json - the state of the manager
	//
	DataDir string

	// ProxyEnv is extra environment variables to set for the proxies.
	ProxyEnv []string

	// SnapshotPeriod is the duration between snapshots. This can be set
	// relatively low to ensure accuracy, because if the new snapshot matches
	// the last snapshot taken, no file will be written. Therefore, setting
	// this low causes only slight CPU/memory usage but doesn't result in
	// disk IO. If this isn't set, ManagerSnapshotPeriod will be the default.
	//
	// This only has an effect if snapshots are enabled (DataDir is set).
	SnapshotPeriod time.Duration

	// CoalescePeriod and QuiescentPeriod control the timers for coalescing
	// updates from the local state. See the defaults at the top of this
	// file for more documentation. These will be set to those defaults
	// by NewManager.
	CoalescePeriod  time.Duration
	QuiescentPeriod time.Duration

	// AllowRoot configures whether proxies can be executed as root (EUID == 0).
	// If this is false then the manager will run, and proxies can be added
	// and removed, but none will be started and errors will be logged
	// to the logger.
	AllowRoot bool

	// lock is held while reading/writing any internal state of the manager.
	// cond is a condition variable on lock that is broadcast for runState
	// changes.
	lock *sync.Mutex
	cond *sync.Cond

	// runState is the current state of the manager. To read this the
	// lock must be held. The condition variable cond can be waited on
	// for changes to this value.
	runState managerRunState

	// lastSnapshot stores a pointer to the last snapshot that successfully
	// wrote to disk. This is used for dup detection to prevent rewriting
	// the same snapshot multiple times. Snapshots should never be that
	// large so keeping it in-memory should be cheap even for thousands of
	// proxies (unlikely scenario).
	lastSnapshot *snapshot

	proxies map[string]Proxy
}

// NewManager initializes a Manager. After initialization, the exported
// fields should be configured as desired. To start the Manager, execute
// Run in a goroutine.
func NewManager() *Manager {
	var lock sync.Mutex
	return &Manager{
		Logger:          defaultLogger,
		SnapshotPeriod:  ManagerSnapshotPeriod,
		CoalescePeriod:  ManagerCoalescePeriod,
		QuiescentPeriod: ManagerQuiescentPeriod,
		lock:            &lock,
		cond:            sync.NewCond(&lock),
		proxies:         make(map[string]Proxy),
	}
}

// defaultLogger is the default logger for NewManager so that the Logger
// field is never nil.
var defaultLogger = log.New(os.Stderr, "", log.LstdFlags)

// managerRunState is the state of the Manager.
//
// This is a basic state machine with the following transitions:
//
//   * idle => running, stopped
//   * running => stopping, stopped
//   * stopping => stopped
//   * stopped => <>
//
type managerRunState uint8

const (
	managerStateIdle managerRunState = iota
	managerStateRunning
	managerStateStopping
	managerStateStopped
)

// Close stops the manager. Managed processes are NOT stopped.
func (m *Manager) Close() error {
	m.lock.Lock()
	defer m.lock.Unlock()

	return m.stop(func(p Proxy) error {
		return p.Close()
	})
}

// Kill will Close the manager and Kill all proxies that were being managed.
// Only ONE of Kill or Close must be called. If Close has been called already
// then this will have no effect.
func (m *Manager) Kill() error {
	m.lock.Lock()
	defer m.lock.Unlock()

	return m.stop(func(p Proxy) error {
		return p.Stop()
	})
}

// stop stops the run loop and cleans up all the proxies by calling
// the given cleaner. If the cleaner returns an error the proxy won't be
// removed from the map.
//
// The lock must be held while this is called.
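//
// stop drives the run-state machine directly: if the manager is idle it
// transitions straight to stopped; if Run is active it moves to stopping,
// broadcasts on cond so Run's watcher goroutine wakes up and exits, then
// waits on cond until Run marks the state stopped before cleaning up.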
func (m *Manager) stop(cleaner func(Proxy) error) error {
	for {
		// Special case state that exits the for loop
		if m.runState == managerStateStopped {
			break
		}

		switch m.runState {
		case managerStateIdle:
			// Idle so just set it to stopped and return. We notify
			// the condition variable in case others are waiting.
			m.runState = managerStateStopped
			m.cond.Broadcast()
			return nil

		case managerStateRunning:
			// Set the state to stopping and broadcast to all waiters,
			// since Run is sitting on cond.Wait.
			m.runState = managerStateStopping
			m.cond.Broadcast()
			m.cond.Wait() // Wait on the stopping event

		case managerStateStopping:
			// Still stopping, wait...
			m.cond.Wait()
		}
	}

	// Clean up all the proxies. The accumulated result uses a distinct
	// variable name so per-proxy errors don't shadow it.
	var errs error
	for id, proxy := range m.proxies {
		if err := cleaner(proxy); err != nil {
			errs = multierror.Append(
				errs, fmt.Errorf("failed to stop proxy %q: %s", id, err))
			continue
		}

		// Remove it since it is already stopped successfully
		delete(m.proxies, id)
	}

	return errs
}

// Run syncs with the local state and supervises existing proxies.
//
// This blocks and should be run in a goroutine. If another Run is already
// executing, this will do nothing and return.
func (m *Manager) Run() {
	m.lock.Lock()
	if m.runState != managerStateIdle {
		m.lock.Unlock()
		return
	}

	// Set the state to running
	m.runState = managerStateRunning
	m.lock.Unlock()

	// Start a goroutine that just waits for a stop request
	stopCh := make(chan struct{})
	go func() {
		defer close(stopCh)
		m.lock.Lock()
		defer m.lock.Unlock()

		// We wait for anything not running, just so we're more resilient
		// in the face of state machine issues. Basically any state change
		// will cause us to quit.
		for m.runState == managerStateRunning {
			m.cond.Wait()
		}
	}()

	// When we exit, we set the state to stopped and broadcast to any
	// waiting Close functions that they can return.
	defer func() {
		m.lock.Lock()
		m.runState = managerStateStopped
		m.cond.Broadcast()
		m.lock.Unlock()
	}()

	// Register for proxy catalog change notifications
	notifyCh := make(chan struct{}, 1)
	m.State.NotifyProxy(notifyCh)
	defer m.State.StopNotifyProxy(notifyCh)

	// Start the timer for snapshots. We don't use a ticker because disk
	// IO can be slow and we don't want overlapping notifications. So we only
	// reset the timer once the snapshot is complete rather than continuously.
	snapshotTimer := time.NewTimer(m.SnapshotPeriod)
	defer snapshotTimer.Stop()

	m.Logger.Println("[DEBUG] agent/proxy: managed Connect proxy manager started")
SYNC:
	for {
		// Sync first, before waiting on further notifications so that
		// we can start with a known-current state.
		m.sync()

		// Note: we don't use a time.Timer for these because both periods
		// are relatively short anyway, so the channels become eligible for
		// GC very quickly and the overhead is not a concern.
		var quiescent, quantum <-chan time.Time

		// Start a loop waiting for events from the local state store. This
		// loops rather than just `select` so we can coalesce many state
		// updates over a period of time.
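		//
		// For example (describing the timers armed below): the first
		// notification after a sync arms both quantum (CoalescePeriod) and
		// quiescent (QuiescentPeriod); each further notification re-arms only
		// the quiescent timer. A resync therefore happens once updates go
		// quiet for QuiescentPeriod, or at the latest after CoalescePeriod of
		// continuous churn.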
		for {
			select {
			case <-notifyCh:
				// If this is our first notification since the last sync,
				// reset the quantum timer which is the max time we'll wait.
				if quantum == nil {
					quantum = time.After(m.CoalescePeriod)
				}

				// Always reset the quiescent timer
				quiescent = time.After(m.QuiescentPeriod)

			case <-quantum:
				continue SYNC

			case <-quiescent:
				continue SYNC

			case <-snapshotTimer.C:
				// Perform a snapshot
				if path := m.SnapshotPath(); path != "" {
					if err := m.snapshot(path, true); err != nil {
						m.Logger.Printf("[WARN] agent/proxy: failed to snapshot state: %s", err)
					}
				}

				// Reset
				snapshotTimer.Reset(m.SnapshotPeriod)

			case <-stopCh:
				// Stop immediately, no cleanup
				m.Logger.Println("[DEBUG] agent/proxy: Stopping managed Connect proxy manager")
				return
			}
		}
	}
}

// sync syncs data with the local state store to update the current manager
// state and start/stop necessary proxies.
func (m *Manager) sync() {
	m.lock.Lock()
	defer m.lock.Unlock()

	// If we don't allow root and we're root, then log a high sev message.
	if !m.AllowRoot && isRoot() {
		m.Logger.Println("[WARN] agent/proxy: running as root, will not start managed proxies")
		return
	}

	// Get the current set of proxies
	state := m.State.Proxies()

	// Go through our existing proxies that we're currently managing to
	// determine if they're still in the state or not. If they're in the
	// state, we need to diff to determine if we're starting a new proxy.
	// If they're not in the state, then we need to stop the proxy since it
	// is now orphaned.
	for id, proxy := range m.proxies {
		// Get the proxy.
		stateProxy, ok := state[id]
		if ok {
			// Remove the proxy from the state so we don't start it new.
			delete(state, id)

			// Make the proxy so we can compare. This does not start it.
			proxy2, err := m.newProxy(stateProxy)
			if err != nil {
				m.Logger.Printf("[ERROR] agent/proxy: failed to initialize proxy for %q: %s", id, err)
				continue
			}

			// If the proxies are equal, then do nothing
			if proxy.Equal(proxy2) {
				continue
			}

			// Proxies are not equal, so we should stop it. We add it
			// back to the state here (unlikely case) so the loop below starts
			// the new one.
			state[id] = stateProxy

			// Continue out of `if` as if proxy didn't exist so we stop it
		}

		// Proxy is deregistered. Remove it from our map and stop it
		delete(m.proxies, id)
		if err := proxy.Stop(); err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to stop deregistered proxy for %q: %s", id, err)
		}
	}

	// Remaining entries in state are new proxies. Start them!
	for id, stateProxy := range state {
		proxy, err := m.newProxy(stateProxy)
		if err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to initialize proxy for %q: %s", id, err)
			continue
		}

		if err := proxy.Start(); err != nil {
			m.Logger.Printf("[ERROR] agent/proxy: failed to start proxy for %q: %s", id, err)
			continue
		}

		m.proxies[id] = proxy
	}
}

// newProxy creates the proper Proxy implementation for the configured
// local managed proxy.
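//
// Currently only daemon mode (structs.ProxyExecModeDaemon) is handled: the
// returned *Daemon wraps an exec.Cmd built from the proxy's command, with
// stdout/stderr redirected to per-service log files and a pid file placed
// under the data directory.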
func (m *Manager) newProxy(mp *local.ManagedProxy) (Proxy, error) {
	// Defensive because the alternative is to panic which is not desired
	if mp == nil || mp.Proxy == nil {
		return nil, fmt.Errorf("internal error: nil *local.ManagedProxy or Proxy field")
	}
	p := mp.Proxy

	// We reuse the service ID a few times
	id := p.ProxyService.ID

	// Create the Proxy. We could just as easily switch on p.ExecMode here,
	// but we want a single location where ExecMode maps to a Proxy so the
	// chance of the mapping being wrong is lower.
	proxy, err := m.newProxyFromMode(p.ExecMode, id)
	if err != nil {
		return nil, err
	}

	// Depending on the proxy type we configure the rest from our ManagedProxy
	switch proxy := proxy.(type) {
	case *Daemon:
		command := p.Command

		// This should never happen since validation should happen upstream
		// but verify it because the alternative is to panic below.
		if len(command) == 0 {
			return nil, fmt.Errorf("daemon mode managed proxy requires command")
		}

		// Build the command to execute.
		var cmd exec.Cmd
		cmd.Path = command[0]
		cmd.Args = command // Args[0] is the path; exec.Cmd expects it to be preserved
		if err := m.configureLogDir(id, &cmd); err != nil {
			return nil, fmt.Errorf("error configuring proxy logs: %s", err)
		}

		// Pass in the environment variables for the proxy process
		cmd.Env = append(m.ProxyEnv, os.Environ()...)

		// Build the daemon structure
		proxy.Command = &cmd
		proxy.ProxyID = id
		proxy.ProxyToken = mp.ProxyToken
		return proxy, nil

	default:
		return nil, fmt.Errorf("unsupported managed proxy type: %q", p.ExecMode)
	}
}

// newProxyFromMode just initializes the proxy structure from only the mode
// and the service ID. This is a shared method between newProxy and Restore
// so that we only have one location where we turn ExecMode into a Proxy.
func (m *Manager) newProxyFromMode(mode structs.ProxyExecMode, id string) (Proxy, error) {
	switch mode {
	case structs.ProxyExecModeDaemon:
		return &Daemon{
			Logger:  m.Logger,
			PidPath: pidPath(filepath.Join(m.DataDir, "pids"), id),
		}, nil

	default:
		return nil, fmt.Errorf("unsupported managed proxy type: %q", mode)
	}
}

// configureLogDir sets up the file descriptors to stdout/stderr so that
// they log to the proper file path for the given service ID.
func (m *Manager) configureLogDir(id string, cmd *exec.Cmd) error {
	// Create the log directory
	logDir := ""
	if m.DataDir != "" {
		logDir = filepath.Join(m.DataDir, "logs")
		if err := os.MkdirAll(logDir, 0700); err != nil {
			return err
		}
	}

	// Configure the stdout, stderr paths
	stdoutPath := logPath(logDir, id, "stdout")
	stderrPath := logPath(logDir, id, "stderr")

	// Open the files. We want to append to each. We expect these files
	// to be rotated by some external process.
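	//
	// O_APPEND|O_WRONLY|O_CREATE opens (or creates) each file for append-only
	// writes with 0600 permissions; because every write goes to the current
	// end of file, an external rotation scheme that truncates the files in
	// place can reclaim space without restarting the proxy.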
	stdoutF, err := os.OpenFile(stdoutPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
	if err != nil {
		return fmt.Errorf("error creating stdout file: %s", err)
	}
	stderrF, err := os.OpenFile(stderrPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
	if err != nil {
		// Don't forget to close stdoutF which successfully opened
		stdoutF.Close()

		return fmt.Errorf("error creating stderr file: %s", err)
	}

	cmd.Stdout = stdoutF
	cmd.Stderr = stderrF
	return nil
}

// logPath is a helper to return the path to the log file for the given
// directory, service ID, and stream type (stdout or stderr).
func logPath(dir, id, stream string) string {
	return filepath.Join(dir, fmt.Sprintf("%s-%s.log", id, stream))
}

// pidPath is a helper to return the path to the pid file for the given
// directory and service ID.
func pidPath(dir, id string) string {
	// If no directory is given we do not write a pid
	if dir == "" {
		return ""
	}

	return filepath.Join(dir, fmt.Sprintf("%s.pid", id))
}
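
// runProxyManager is an illustrative sketch, not part of the original file:
// it shows how a caller might wire up a Manager by configuring the exported
// fields after NewManager, running it in a goroutine, and closing it on
// shutdown. The state, dataDir, and logger parameters are assumptions made
// for the sketch; the real agent configures these elsewhere. A caller would
// typically also restore the previous snapshot (via the Restore helper
// referenced in the newProxyFromMode comment) before calling Run; that step
// is omitted here since Restore is defined outside this file.
func runProxyManager(state *local.State, dataDir string, logger *log.Logger) (*Manager, func() error) {
	m := NewManager()
	m.State = state
	m.Logger = logger
	m.DataDir = filepath.Join(dataDir, "proxy")

	// Run blocks while it supervises proxies, so it gets its own goroutine.
	go m.Run()

	// The returned stop function shuts the manager down without stopping the
	// managed proxy processes; use Kill instead to stop those as well.
	return m, m.Close
}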