github.com/nycdavid/zeus@v0.0.0-20201208104106-9ba439429e03/go/processtree/slavenode.go

package processtree

import (
	"bufio"
	"fmt"
	"math/rand"
	"os"
	"os/exec"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/burke/zeus/go/filemonitor"
	"github.com/burke/zeus/go/messages"
	slog "github.com/burke/zeus/go/shinylog"
	"github.com/burke/zeus/go/unixsocket"
)

const (
	forceKillTimeout = time.Second
)

type SlaveNode struct {
	ProcessTreeNode
	socket      *unixsocket.Usock
	pid         int
	Error       string
	Slaves      []*SlaveNode
	Commands    []*CommandNode
	fileMonitor filemonitor.FileMonitor

	hasSuccessfullyBooted bool

	needsRestart        chan bool
	commandBootRequests chan *CommandRequest
	slaveBootRequests   chan *SlaveNode

	L        sync.Mutex
	features map[string]bool
	featureL sync.Mutex
	state    string

	event chan bool
}

type CommandReply struct {
	State string
	File  *os.File
}

type CommandRequest struct {
	Name    string
	Retchan chan *CommandReply
}

const (
	SUnbooted = "U"
	SBooting  = "B"
	SReady    = "R"
	SCrashed  = "C"
)

var humanreadableStates = map[string]string{
	SUnbooted: "unbooted",
	SBooting:  "booting",
	SReady:    "ready",
	SCrashed:  "crashed",
}

func (tree *ProcessTree) NewSlaveNode(identifier string, parent *SlaveNode, monitor filemonitor.FileMonitor) *SlaveNode {
	s := SlaveNode{}
	s.needsRestart = make(chan bool, 1)
	s.slaveBootRequests = make(chan *SlaveNode, 256)
	s.commandBootRequests = make(chan *CommandRequest, 256)
	s.features = make(map[string]bool)
	s.event = make(chan bool)
	s.Name = identifier
	s.Parent = parent
	s.fileMonitor = monitor
	tree.SlavesByName[identifier] = &s
	return &s
}
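// A minimal caller sketch (hypothetical; the real wiring lives elsewhere in
// zeus, which builds the tree from zeus.json and drives each node via the
// SlaveMonitor). The names fm and monitor are assumed, not part of this file;
// it only illustrates how the pieces here fit together:
//
//	root := tree.NewSlaveNode("boot", nil, fm)
//	child := tree.NewSlaveNode("default_bundle", root, fm)
//	root.Slaves = append(root.Slaves, child)
//	go root.Run(monitor)  // each node runs its own state machine
//	go child.Run(monitor) // and transitions independently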
func (s *SlaveNode) RequestRestart() {
	s.L.Lock()
	defer s.L.Unlock()

	// If this slave is currently waiting on a process to boot,
	// unhang it and force it to transition to the crashed state,
	// where it will wait for restart messages.
	if s.ReportBootEvent() {
		s.Error = "Received restart request while booting"
	}

	// Enqueue the restart if there isn't already one in the channel.
	select {
	case s.needsRestart <- true:
	default:
	}
}

func (s *SlaveNode) RequestSlaveBoot(slave *SlaveNode) {
	s.slaveBootRequests <- slave
}

func (s *SlaveNode) RequestCommandBoot(request *CommandRequest) {
	s.commandBootRequests <- request
}

func (s *SlaveNode) ReportBootEvent() bool {
	select {
	case s.event <- true:
		return true
	default:
		return false
	}
}

func (s *SlaveNode) SlaveWasInitialized(pid, parentPid int, usock *unixsocket.Usock, featurePipeFd int) {
	file := os.NewFile(uintptr(featurePipeFd), "featurepipe")

	s.L.Lock()
	if !s.ReportBootEvent() {
		s.forceKillPid(pid)
		s.trace("Unexpected process %d with parent %d for slave %q was killed", pid, parentPid, s.Name)
	} else {
		s.wipe()
		s.pid = pid
		s.socket = usock
		go s.handleMessages(file)
		s.trace("initialized slave %s with pid %d from parent %d", s.Name, pid, parentPid)
	}
	s.L.Unlock()
}

func (s *SlaveNode) Run(monitor *SlaveMonitor) {
	nextState := SUnbooted
	for {
		s.L.Lock()
		s.state = nextState
		s.L.Unlock()
		monitor.tree.StateChanged <- true
		switch nextState {
		case SUnbooted:
			s.trace("entering state SUnbooted")
			nextState = s.doUnbootedState(monitor)
		case SBooting:
			s.trace("entering state SBooting")
			nextState = s.doBootingState()
		case SReady:
			s.trace("entering state SReady")
			nextState = s.doReadyState()
		case SCrashed:
			s.trace("entering state SCrashed")
			nextState = s.doCrashedState()
		default:
			slog.FatalErrorString("Unrecognized state: " + nextState)
		}
	}
}

func (s *SlaveNode) State() string {
	s.L.Lock()
	defer s.L.Unlock()

	return s.state
}

func (s *SlaveNode) HumanReadableState() string {
	return humanreadableStates[s.state]
}

func (s *SlaveNode) HasFeature(file string) bool {
	s.featureL.Lock()
	defer s.featureL.Unlock()
	return s.features[file]
}

// These "doXState" functions are called when a SlaveNode enters a state. They
// are expected to continue to execute until the node is ready to leave that
// state, at which point they return the next state to transition to.

// "SUnbooted" represents the state where we do not yet have the PID
// of a process to use for *this* node. In this state, we wait for the
// parent process to spawn a process for us and hear back from the
// SlaveMonitor.
func (s *SlaveNode) doUnbootedState(monitor *SlaveMonitor) string { // -> {SBooting, SCrashed}
	if s.Parent == nil {
		s.L.Lock()
		parts := strings.Split(monitor.tree.ExecCommand, " ")
		cmd := exec.Command(parts[0], parts[1:]...)
		file := monitor.remoteMasterFile
		cmd.Env = append(os.Environ(), fmt.Sprintf("ZEUS_MASTER_FD=%d", file.Fd()))
		cmd.ExtraFiles = []*os.File{file}
		go s.babysitRootProcess(cmd)
		s.L.Unlock()
	} else {
		s.Parent.RequestSlaveBoot(s)
	}

	<-s.event // sent by SlaveWasInitialized

	s.L.Lock()
	defer s.L.Unlock()
	if s.Error != "" {
		return SCrashed
	}
	return SBooting
}
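// The boot handshake above hinges on s.event being unbuffered: doUnbootedState
// blocks on <-s.event, while ReportBootEvent uses a non-blocking send, so a
// send succeeds only if some goroutine is actually parked waiting. A
// stripped-down sketch of the idiom (illustrative only, not part of this
// package):
//
//	event := make(chan bool)
//	go func() { <-event /* waiter: proceed to SBooting */ }()
//	select {
//	case event <- true:
//		// a waiter was parked on the channel and is now unblocked
//	default:
//		// nobody was waiting; react instead (e.g. kill the stray process)
//	}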
// In "SBooting", we have a pid and socket to the process we will use,
// but it has not yet finished initializing (generally, running the code
// specific to this slave). When we receive a message about the success or
// failure of this operation, we transition to either crashed or ready.
func (s *SlaveNode) doBootingState() string { // -> {SCrashed, SReady}
	// The slave will execute its action and respond with a status...
	// Note we don't hold the mutex while waiting for the action to execute.
	msg, err := s.socket.ReadMessage()
	if err != nil {
		s.L.Lock()
		defer s.L.Unlock()
		s.Error = err.Error()
		slog.ErrorString("[" + s.Name + "] " + err.Error())

		return SCrashed
	}

	s.trace("received action message")
	s.L.Lock()
	defer s.L.Unlock()

	msg, err = messages.ParseActionResponseMessage(msg)
	if err != nil {
		slog.ErrorString("[" + s.Name + "] " + err.Error())
	}
	if msg == "OK" {
		return SReady
	}

	// Clean up:
	if s.pid > 0 {
		syscall.Kill(s.pid, syscall.SIGKILL)
	}
	s.wipe()
	s.Error = msg
	return SCrashed
}

// In the "SReady" state, we have a functioning process we can spawn
// new processes off of. We respond to requests to boot slaves and
// run commands until we receive a request to restart. This kills
// the process and transitions to SUnbooted.
func (s *SlaveNode) doReadyState() string { // -> SUnbooted
	s.hasSuccessfullyBooted = true

	// If we have a queued restart, service that rather than booting
	// slaves or commands on potentially stale code.
	select {
	case <-s.needsRestart:
		s.doRestart()
		return SUnbooted
	default:
	}

	for {
		select {
		case <-s.needsRestart:
			s.doRestart()
			return SUnbooted
		case slave := <-s.slaveBootRequests:
			s.bootSlave(slave)
		case request := <-s.commandBootRequests:
			s.bootCommand(request)
		}
	}
}

// In the "SCrashed" state, we have an error message from starting
// a process to propagate to the user and all slave nodes. We will
// continue propagating the error until we receive a request to restart.
func (s *SlaveNode) doCrashedState() string { // -> SUnbooted
	// If we have a queued restart, service that rather than booting
	// slaves or commands on potentially stale code.
	select {
	case <-s.needsRestart:
		s.doRestart()
		return SUnbooted
	default:
	}

	for {
		select {
		case <-s.needsRestart:
			s.doRestart()
			return SUnbooted
		case slave := <-s.slaveBootRequests:
			slave.L.Lock()
			slave.Error = s.Error
			slave.ReportBootEvent()
			slave.L.Unlock()
		case request := <-s.commandBootRequests:
			s.L.Lock()
			s.trace("reporting crash to command %v", request)
			request.Retchan <- &CommandReply{SCrashed, nil}
			s.L.Unlock()
		}
	}
}
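// How a caller consumes the command protocol above (hypothetical sketch; in
// zeus the real consumer is the clienthandler module mentioned near the
// bottom of this file):
//
//	req := &CommandRequest{Name: "console", Retchan: make(chan *CommandReply, 1)}
//	node.RequestCommandBoot(req)
//	reply := <-req.Retchan
//	if reply.State == SCrashed {
//		// reply.File is nil; surface node.Error to the user instead
//	} else {
//		// reply.File wraps the FD the slave passed back over the unix socket
//	}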
func (s *SlaveNode) doRestart() {
	s.L.Lock()
	s.ForceKill()
	s.wipe()
	s.L.Unlock()

	// Drain and ignore any enqueued slave boot requests, since
	// we're going to make them all restart again anyway.
	drained := false
	for !drained {
		select {
		case <-s.slaveBootRequests:
		default:
			drained = true
		}
	}

	for _, slave := range s.Slaves {
		slave.RequestRestart()
	}
}

func (s *SlaveNode) bootSlave(slave *SlaveNode) {
	s.L.Lock()
	defer s.L.Unlock()

	s.trace("now sending slave boot request for %s", slave.Name)

	msg := messages.CreateSpawnSlaveMessage(slave.Name)
	_, err := s.socket.WriteMessage(msg)
	if err != nil {
		slog.Error(err)
	}
}

// This unfortunately holds the mutex for a little while, and if the
// command dies very early, the entire slave can effectively deadlock.
// TODO: review this.
func (s *SlaveNode) bootCommand(request *CommandRequest) {
	s.L.Lock()
	defer s.L.Unlock()

	s.trace("now sending command boot request %v", request)

	identifier := request.Name
	msg := messages.CreateSpawnCommandMessage(identifier)
	_, err := s.socket.WriteMessage(msg)
	if err != nil {
		slog.Error(err)
		return
	}
	commandFD, err := s.socket.ReadFD()
	if err != nil {
		fmt.Println(s.socket)
		slog.Error(err)
		return
	}
	fileName := strconv.Itoa(rand.Int())
	commandFile := os.NewFile(uintptr(commandFD), fileName)
	request.Retchan <- &CommandReply{s.state, commandFile}
}

func (s *SlaveNode) ForceKill() {
	// Note that we don't try to lock the mutex.
	s.forceKillPid(s.pid)
}

func (s *SlaveNode) wipe() {
	s.pid = 0
	s.socket = nil
	s.Error = ""
}

func (s *SlaveNode) babysitRootProcess(cmd *exec.Cmd) {
	// We want to let this process run "forever", but it will eventually
	// die... either on program termination or when its dependencies change
	// and we kill it. When it's requested to restart, err is "signal 9",
	// and we do nothing.
	s.trace("running the root command now")
	output, err := cmd.CombinedOutput()
	if err == nil {
		// TODO
		s.trace("root process exited; output was: %s", output)
		println(string(output))
		/* ErrorConfigCommandCrashed(string(output)) */
		return // without this, err.Error() below would panic on a nil error
	}
	msg := err.Error()
	if !s.hasSuccessfullyBooted {
		// TODO
		s.trace("root process exited with an error before it could boot: %s; output was: %s", msg, output)
		println(msg)
		/* ErrorConfigCommandCouldntStart(msg, string(output)) */
	} else if msg == "signal 9" {
		s.trace("root process exited because we killed it & it will be restarted: %s; output was: %s", msg, output)
	} else {
		s.L.Lock()
		defer s.L.Unlock()

		s.trace("root process exited with error. Sending it to crashed state. Message was: %s; output: %s", msg, output)
		s.Error = fmt.Sprintf("Zeus root process (%s) died with message %s:\n%s", s.Name, msg, output)
		if !s.ReportBootEvent() {
			s.trace("Unexpected state for root process to be in at this time: %s", s.state)
		}
	}
}
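// The feature pipe read by handleMessages below carries one loaded-file path
// per line, written from the slave process's end of the pipe. A sketch of
// what the writing end amounts to (illustrative; writeFd is a hypothetical
// descriptor, and in zeus the actual writer is the slave process, not code in
// this package):
//
//	w := os.NewFile(uintptr(writeFd), "featurepipe-w")
//	fmt.Fprintln(w, "/app/config/boot.rb") // recorded in s.features and
//	                                       // registered with the file monitor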
// We want to make this the single interface point with the socket.
// We want to republish unneeded messages to channels so other modules
// can pick them up (notably, clienthandler).
func (s *SlaveNode) handleMessages(featurePipe *os.File) {
	reader := bufio.NewReader(featurePipe)
	for {
		msg, err := reader.ReadString('\n')
		if err != nil {
			return
		}
		msg = strings.TrimRight(msg, "\n")
		s.featureL.Lock()
		s.features[msg] = true
		s.featureL.Unlock()
		s.fileMonitor.Add(msg)
	}
}

func (s *SlaveNode) forceKillPid(pid int) error {
	if pid <= 0 {
		return nil
	}

	if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
		err = fmt.Errorf("Error killing pid %d: %v", pid, err)
		s.trace(err.Error())
		return err
	}

	exited := make(chan error)
	go func() {
		for {
			if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
				exited <- err // ESRCH means the process is gone
				return
			}

			// Since the process is not our direct child, we can't use wait
			// and are forced to poll for completion. We know this won't loop
			// forever because the timeout below will SIGKILL the process,
			// which guarantees that it'll go away and we'll get an ESRCH.
			time.Sleep(time.Millisecond)
		}
	}()

	select {
	case err := <-exited:
		if err != nil && err != syscall.ESRCH {
			err = fmt.Errorf("Error sending signal to pid %d: %v", pid, err)
			s.trace(err.Error())
			return err
		}
		return nil
	case <-time.After(forceKillTimeout):
		syscall.Kill(pid, syscall.SIGKILL)
		return nil
	}
}

func (s *SlaveNode) trace(format string, args ...interface{}) {
	if !slog.TraceEnabled() {
		return
	}

	_, file, line, _ := runtime.Caller(1)

	var prefix string
	if s.pid != 0 {
		prefix = fmt.Sprintf("[%s:%d] %s/(%d)", file, line, s.Name, s.pid)
	} else {
		prefix = fmt.Sprintf("[%s:%d] %s/(no PID)", file, line, s.Name)
	}
	newArgs := make([]interface{}, len(args)+1)
	newArgs[0] = prefix
	copy(newArgs[1:], args)
	slog.Trace("%s "+format, newArgs...)
}
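// A standalone sketch of the TERM-then-KILL escalation that forceKillPid
// implements above (illustrative only; error handling elided):
//
//	syscall.Kill(pid, syscall.SIGTERM)
//	deadline := time.After(forceKillTimeout)
//	for {
//		select {
//		case <-deadline:
//			syscall.Kill(pid, syscall.SIGKILL) // guarantees an eventual ESRCH
//			return
//		default:
//			if syscall.Kill(pid, syscall.Signal(0)) != nil {
//				return // ESRCH: the process is gone
//			}
//			time.Sleep(time.Millisecond)
//		}
//	}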