github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/client/daemonservice/execution/execution.go (about) 1 // Copyright 2017 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package execution provides an abstraction for a single execution of a command 16 // with the context of daemonservice. 17 package execution 18 19 import ( 20 "context" 21 "errors" 22 "flag" 23 "fmt" 24 "os" 25 "os/exec" 26 "sync" 27 "sync/atomic" 28 "time" 29 30 log "github.com/golang/glog" 31 "golang.org/x/sync/errgroup" 32 "google.golang.org/protobuf/proto" 33 anypb "google.golang.org/protobuf/types/known/anypb" 34 tspb "google.golang.org/protobuf/types/known/timestamppb" 35 36 "github.com/google/fleetspeak/fleetspeak/src/client/channel" 37 "github.com/google/fleetspeak/fleetspeak/src/client/daemonservice/command" 38 "github.com/google/fleetspeak/fleetspeak/src/client/internal/monitoring" 39 "github.com/google/fleetspeak/fleetspeak/src/client/internal/process" 40 "github.com/google/fleetspeak/fleetspeak/src/client/service" 41 42 fcpb "github.com/google/fleetspeak/fleetspeak/src/client/channel/proto/fleetspeak_channel" 43 dspb "github.com/google/fleetspeak/fleetspeak/src/client/daemonservice/proto/fleetspeak_daemonservice" 44 "github.com/google/fleetspeak/fleetspeak/src/common/fscontext" 45 fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak" 46 mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring" 47 ) 48 49 // We flush the output when either of these thresholds are met. 50 const ( 51 maxFlushBytes = 1 << 20 // 1MB 52 defaultFlushTimeSeconds = int32(60) 53 ) 54 55 var ( 56 // ErrShuttingDown is returned once an execution has started shutting down and 57 // is no longer accepting messages. 58 ErrShuttingDown = errors.New("shutting down") 59 60 stdForward = flag.Bool("std_forward", false, 61 "If set attaches the dependent service to the client's stdin, stdout, stderr. Meant for testing individual daemonservice integrations.") 62 ) 63 64 // How long to wait for the daemon service to send startup data before 65 // starting to monitor resource-usage anyway. 66 var startupDataTimeout = 10 * time.Second 67 68 type atomicString struct { 69 v atomic.Value 70 } 71 72 func (s *atomicString) Set(val string) { 73 s.v.Store(val) 74 } 75 76 func (s *atomicString) Get() string { 77 stored := s.v.Load() 78 if stored == nil { 79 return "" 80 } 81 return stored.(string) 82 } 83 84 type atomicBool struct { 85 v atomic.Value 86 } 87 88 func (b *atomicBool) Set(val bool) { 89 b.v.Store(val) 90 } 91 92 func (b *atomicBool) Get() bool { 93 stored := b.v.Load() 94 if stored == nil { 95 return false 96 } 97 return stored.(bool) 98 } 99 100 type atomicTime struct { 101 v atomic.Value 102 } 103 104 func (t *atomicTime) Set(val time.Time) { 105 t.v.Store(val) 106 } 107 108 func (t *atomicTime) Get() time.Time { 109 stored := t.v.Load() 110 if stored == nil { 111 return time.Unix(0, 0).UTC() 112 } 113 return stored.(time.Time) 114 } 115 116 // An Execution represents a specific execution of a daemonservice. 117 type Execution struct { 118 daemonServiceName string 119 memoryLimit int64 120 samplePeriod time.Duration 121 sampleSize int 122 123 shuttingDown chan struct{} // Closed when Shutdown has been called. 124 125 sc service.Context 126 mu sync.Mutex // protect e.channel 127 channel *channel.Channel 128 cmd *command.Command 129 StartTime time.Time 130 131 outData *dspb.StdOutputData // The next output data to send, once full. 132 lastOut time.Time // Time of most recent output of outData. 133 outLock sync.Mutex // Protects outData, lastOut 134 outFlushBytes int // How many bytes trigger an output. Constant. 135 outServiceName string // The service to send StdOutput messages to. Constant. 136 137 shutdown sync.Once 138 lastActive int64 // Time of the last message input or output in seconds since epoch (UTC), atomic access only. 139 140 dead chan struct{} // closed when the underlying process has died. 141 142 waitForGoroutines func() error 143 startupData chan *fcpb.StartupData // Startup data sent by the daemon process. 144 145 heartbeat atomicTime // Time when the last message was received from the daemon process. 146 monitorHeartbeats bool // Whether to monitor the daemon process's heartbeats, killing it if it doesn't heartbeat often enough. 147 initialHeartbeatTimeout time.Duration // How long to wait for the initial heartbeat. 148 heartbeatTimeout time.Duration // How long to wait for subsequent heartbeats. 149 sending atomicBool // Indicates whether a message-send operation is in progress. 150 serviceVersion atomicString // Version reported by the daemon process. 151 } 152 153 var errProcessTerminated = errors.New("process terminated") 154 155 // New creates and starts an execution of the command described in cfg. Messages 156 // received from the resulting process are passed to sc, as are StdOutput and 157 // ResourceUsage messages. 158 // 159 // The context needs to be valid for the lifetime of the whole process execution! 160 func New(ctx context.Context, daemonServiceName string, cfg *dspb.Config, sc service.Context) (*Execution, error) { 161 cfg = proto.Clone(cfg).(*dspb.Config) 162 163 ret := Execution{ 164 daemonServiceName: daemonServiceName, 165 memoryLimit: cfg.MemoryLimit, 166 sampleSize: int(cfg.ResourceMonitoringSampleSize), 167 samplePeriod: cfg.ResourceMonitoringSamplePeriod.AsDuration(), 168 169 shuttingDown: make(chan struct{}), 170 171 sc: sc, 172 cmd: &command.Command{Cmd: *exec.Command(cfg.Argv[0], cfg.Argv[1:]...)}, 173 StartTime: time.Now(), 174 175 outData: &dspb.StdOutputData{}, 176 lastOut: time.Now(), 177 178 dead: make(chan struct{}), 179 startupData: make(chan *fcpb.StartupData, 1), 180 181 monitorHeartbeats: cfg.MonitorHeartbeats, 182 initialHeartbeatTimeout: cfg.HeartbeatUnresponsiveGracePeriod.AsDuration(), 183 heartbeatTimeout: cfg.HeartbeatUnresponsiveKillPeriod.AsDuration(), 184 } 185 186 if ret.initialHeartbeatTimeout <= 0 { 187 ret.initialHeartbeatTimeout = time.Duration(cfg.HeartbeatUnresponsiveGracePeriodSeconds) * time.Second 188 } 189 if ret.heartbeatTimeout <= 0 { 190 ret.heartbeatTimeout = time.Duration(cfg.HeartbeatUnresponsiveKillPeriodSeconds) * time.Second 191 } 192 if ret.samplePeriod <= 0 { 193 ret.samplePeriod = time.Duration(cfg.ResourceMonitoringSamplePeriodSeconds) * time.Second 194 } 195 196 var err error 197 ret.channel, err = ret.cmd.SetupCommsChannel() 198 if err != nil { 199 return nil, fmt.Errorf("failed to setup a comms channel: %v", err) 200 } 201 202 if cfg.StdParams != nil && cfg.StdParams.ServiceName == "" { 203 log.Errorf("std_params is set, but service_name is empty. Ignoring std_params: %v", cfg.StdParams) 204 cfg.StdParams = nil 205 } 206 207 if *stdForward { 208 log.Warningf("std_forward is set, connecting std... to %s", cfg.Argv[0]) 209 ret.cmd.Stdin = os.Stdin 210 ret.cmd.Stdout = os.Stdout 211 ret.cmd.Stderr = os.Stderr 212 } else if cfg.StdParams != nil { 213 if cfg.StdParams.FlushBytes <= 0 || cfg.StdParams.FlushBytes > maxFlushBytes { 214 cfg.StdParams.FlushBytes = maxFlushBytes 215 } 216 if cfg.StdParams.FlushTimeSeconds <= 0 { 217 cfg.StdParams.FlushTimeSeconds = defaultFlushTimeSeconds 218 } 219 ret.outServiceName = cfg.StdParams.ServiceName 220 ret.outFlushBytes = int(cfg.StdParams.FlushBytes) 221 ret.cmd.Stdout = stdoutWriter{&ret} 222 ret.cmd.Stderr = stderrWriter{&ret} 223 } else { 224 ret.cmd.Stdout = nil 225 ret.cmd.Stderr = nil 226 } 227 228 if err := ret.cmd.Start(); err != nil { 229 // Close all contained channels before discarding the ret object. 230 close(ret.shuttingDown) 231 close(ret.dead) 232 close(ret.startupData) 233 return nil, err 234 } 235 236 errProcessExited := errors.New("process exited") 237 ctx, cancel := fscontext.WithDoneChan(ctx, ErrShuttingDown, ret.shuttingDown) 238 eg, ctx := errgroup.WithContext(ctx) 239 ret.waitForGoroutines = func() error { 240 cancel(nil) 241 242 err := eg.Wait() 243 244 // These are expected errors during a normal shutdown. 245 if errors.Is(err, errInputChannelClosed) || errors.Is(err, ErrShuttingDown) || errors.Is(err, errProcessExited) { 246 log.Warningf("during wait: ignoring shutdown error: %v", err) 247 return nil 248 } 249 return err 250 } 251 252 if cfg.StdParams != nil { 253 eg.Go(func() error { 254 ctx, cancel := fscontext.WithDoneChan(context.TODO(), ErrShuttingDown, ret.dead) 255 defer cancel(nil) 256 period := time.Second * time.Duration(cfg.StdParams.FlushTimeSeconds) 257 return ret.stdFlushRoutine(ctx, period) 258 }) 259 } 260 eg.Go(func() (err error) { 261 defer ret.Shutdown() 262 defer func() { cancel(err) }() 263 return ret.inRoutine(ctx) 264 }) 265 if !cfg.DisableResourceMonitoring { 266 eg.Go(func() (err error) { 267 defer func() { cancel(err) }() 268 return ret.statsRoutine(ctx) 269 }) 270 } 271 eg.Go(func() (err error) { 272 defer ret.Shutdown() 273 defer func() { cancel(errProcessExited) }() 274 defer func() { 275 startTime := tspb.New(ret.StartTime) 276 if err := startTime.CheckValid(); err != nil { 277 log.Errorf("Failed to convert process start time: %v", err) 278 } 279 if !cfg.DisableResourceMonitoring { 280 rud := &mpb.ResourceUsageData{ 281 Scope: ret.daemonServiceName, 282 Pid: int64(ret.cmd.Process.Pid), 283 ProcessStartTime: startTime, 284 DataTimestamp: tspb.Now(), 285 ResourceUsage: &mpb.AggregatedResourceUsage{}, 286 ProcessTerminated: true, 287 } 288 // The outer context might already be canceled. 289 ctx := context.WithoutCancel(ctx) 290 if err := monitoring.SendProtoToServer(ctx, rud, "ResourceUsage", ret.sc); err != nil { 291 log.Errorf("Failed to send final resource-usage proto: %v", err) 292 } 293 } 294 }() 295 waitResult := ret.cmd.Wait() 296 close(ret.dead) 297 if waitResult != nil { 298 cancel(fmt.Errorf("subprocess exited with error: %v", waitResult)) 299 } 300 return waitResult 301 }) 302 return &ret, nil 303 } 304 305 // Wait waits for all aspects of this execution to finish. This should happen 306 // soon after shutdown is called. Wait must be called to free up resources and 307 // to reap the child process. 308 func (e *Execution) Wait() error { 309 <-e.shuttingDown 310 // Flush out channel, as a reader should do. 311 for range e.channel.In { 312 } 313 e.channel.Wait() 314 for range e.channel.Err { 315 } 316 return e.waitForGoroutines() 317 } 318 319 // LastActive returns the last time that a message was sent or received, to the 320 // nearest second. 321 func (e *Execution) LastActive() time.Time { 322 return time.Unix(atomic.LoadInt64(&e.lastActive), 0).UTC() 323 } 324 325 func (e *Execution) setLastActive(t time.Time) { 326 atomic.StoreInt64(&e.lastActive, t.Unix()) 327 } 328 329 func dataSize(o *dspb.StdOutputData) int { 330 return len(o.Stdout) + len(o.Stderr) 331 } 332 333 // flushOut flushes e.outData. It assumes that e.outLock is already held. 334 func (e *Execution) flushOut() { 335 // set lastOut before the blocking call to sc.Send, so the next flush 336 // has an accurate sense of how stale the data might be. 337 n := time.Now() 338 e.lastOut = n 339 if dataSize(e.outData) == 0 { 340 return 341 } 342 e.setLastActive(n) 343 344 e.outData.Pid = int64(e.cmd.Process.Pid) 345 d, err := anypb.New(e.outData) 346 if err != nil { 347 log.Errorf("unable to marshal StdOutputData: %v", err) 348 } else { 349 e.sc.Send(context.Background(), service.AckMessage{ 350 M: &fspb.Message{ 351 Destination: &fspb.Address{ServiceName: e.outServiceName}, 352 MessageType: "StdOutput", 353 Data: d, 354 }}) 355 } 356 e.outData = &dspb.StdOutputData{ 357 MessageIndex: e.outData.MessageIndex + 1, 358 } 359 } 360 361 func (e *Execution) writeToOut(p []byte, isErr bool) { 362 e.outLock.Lock() 363 defer e.outLock.Unlock() 364 365 for { 366 currSize := dataSize(e.outData) 367 // If it all fits, write it and return. 368 if currSize+len(p) <= e.outFlushBytes { 369 if isErr { 370 e.outData.Stderr = append(e.outData.Stderr, p...) 371 } else { 372 e.outData.Stdout = append(e.outData.Stdout, p...) 373 } 374 return 375 } 376 // Write what does fit, flush, continue. 377 toWrite := e.outFlushBytes - currSize 378 if isErr { 379 e.outData.Stderr = append(e.outData.Stderr, p[:toWrite]...) 380 } else { 381 e.outData.Stdout = append(e.outData.Stdout, p[:toWrite]...) 382 } 383 p = p[toWrite:] 384 e.flushOut() 385 } 386 } 387 388 type stdoutWriter struct { 389 e *Execution 390 } 391 392 func (w stdoutWriter) Write(p []byte) (int, error) { 393 w.e.writeToOut(p, false) 394 return len(p), nil 395 } 396 397 type stderrWriter struct { 398 e *Execution 399 } 400 401 func (w stderrWriter) Write(p []byte) (int, error) { 402 w.e.writeToOut(p, true) 403 return len(p), nil 404 } 405 406 // stdFlushRoutine calls e.flushOut() periodically with e.outLock held. 407 // When ctx is canceled, it does it one last time and returns the context's 408 // cancelation cause. 409 func (e *Execution) stdFlushRoutine(ctx context.Context, period time.Duration) error { 410 t := time.NewTicker(period) 411 defer t.Stop() 412 for { 413 select { 414 case <-t.C: 415 e.outLock.Lock() 416 if e.lastOut.Add(period).Before(time.Now()) { 417 e.flushOut() 418 } 419 e.outLock.Unlock() 420 case <-ctx.Done(): 421 e.outLock.Lock() 422 e.flushOut() 423 e.outLock.Unlock() 424 return context.Cause(ctx) 425 } 426 } 427 } 428 429 func sleepCtx(ctx context.Context, d time.Duration) { 430 t := time.NewTimer(d) 431 defer t.Stop() 432 select { 433 case <-ctx.Done(): 434 return 435 case <-t.C: 436 return 437 } 438 } 439 440 // Shutdown shuts down this execution. 441 func (e *Execution) Shutdown() { 442 e.shutdown.Do(func() { 443 // First we attempt a gentle shutdown. Closing e.shuttingDown tells our user 444 // not to give us any more data through SendMsg(). 445 close(e.shuttingDown) 446 447 // Closing e.channel.Out will cause channel to close the pipe to the 448 // dependent process, which then causes it to clean up nicely. 449 e.mu.Lock() 450 close(e.channel.Out) 451 e.mu.Unlock() 452 453 // Context bound to the process lifetime 454 ctx, cancel := fscontext.WithDoneChan(context.TODO(), errProcessTerminated, e.dead) 455 defer cancel(nil) 456 457 sleepCtx(ctx, time.Second) 458 if ctx.Err() != nil { 459 return 460 } 461 462 // This pattern is technically racy - the process could end and the process 463 // id could be recycled since the end of sleepCtx and before we SoftKill 464 // or Kill using the process id. 465 // 466 // A formally correct way to implement this is to spawn a wrapper process 467 // which does not die in response to SIGTERM but forwards the signal to the 468 // wrapped process - its child. This would ensure that the process is still 469 // around all the way to the SIGKILL. 470 if err := e.cmd.SoftKill(); err != nil { 471 log.Errorf("SoftKill [%d] returned error: %v", e.cmd.Process.Pid, err) 472 } 473 sleepCtx(ctx, time.Second) 474 if ctx.Err() != nil { 475 return 476 } 477 478 if err := e.cmd.Kill(); err != nil { 479 log.Errorf("Kill [%d] returned error: %v", e.cmd.Process.Pid, err) 480 } 481 sleepCtx(ctx, time.Second) 482 if ctx.Err() != nil { 483 return 484 } 485 486 // It is hard to imagine how we might end up here - maybe the process is 487 // somehow stuck in a system call or there is some other OS level weirdness. 488 // One possibility is that cmd is a zombie process now. 489 log.Errorf("Subprocess [%d] appears to have survived SIGKILL.", e.cmd.Process.Pid) 490 }) 491 } 492 493 // inRoutine reads messages from the dependent process and passes them to 494 // fleetspeak. It returns with an appropriate error when the input channel is 495 // closed or has an error. 496 func (e *Execution) inRoutine(ctx context.Context) error { 497 var startupDone bool 498 499 for { 500 m, err := e.readMsg(ctx) 501 if err != nil { 502 if errors.Is(err, errInputChannelClosed) && ctx.Err() != nil { 503 return nil // This is expected during shutdown 504 } 505 return err 506 } 507 e.setLastActive(time.Now()) 508 e.heartbeat.Set(time.Now()) 509 510 if m.Destination != nil && m.Destination.ServiceName == "system" { 511 switch m.MessageType { 512 case "StartupData": 513 if startupDone { 514 log.Warning("Received spurious startup message, ignoring.") 515 break 516 } 517 startupDone = true 518 519 sd := &fcpb.StartupData{} 520 if err := m.Data.UnmarshalTo(sd); err != nil { 521 log.Warningf("Failed to parse startup data from initial message: %v", err) 522 } else { 523 if sd.Version != "" { 524 e.serviceVersion.Set(sd.Version) 525 } 526 e.startupData <- sd 527 } 528 close(e.startupData) // No more values to send through the channel. 529 case "Heartbeat": 530 // Pass, handled above. 531 default: 532 log.Warningf("Unknown system message type: %s", m.MessageType) 533 } 534 } else { 535 e.sending.Set(true) 536 // sc.Send() buffers the message locally for sending to the Fleetspeak server. It will 537 // block if the buffer is full. 538 if err := e.sc.Send(context.Background(), service.AckMessage{M: m}); err != nil { 539 log.Errorf("error sending message to server: %v", err) 540 } 541 e.sending.Set(false) 542 } 543 } 544 } 545 546 // SendMsg sends m to the execution. 547 // 548 // It returns nil on success, ErrShuttingDown if the output channel was already 549 // closed, and context.Cause(ctx) if ctx was canceled. 550 func (e *Execution) SendMsg(ctx context.Context, m *fspb.Message) error { 551 select { 552 case <-e.shuttingDown: 553 // already shutting down, the e.channel.Out channel might already be closed 554 return ErrShuttingDown 555 default: 556 } 557 558 e.mu.Lock() 559 defer e.mu.Unlock() 560 561 select { 562 case <-ctx.Done(): 563 return context.Cause(ctx) 564 case e.channel.Out <- m: 565 return nil 566 } 567 } 568 569 var errInputChannelClosed = errors.New("input channel closed") 570 571 // readMsg blocks until a message is available from the channel. 572 // It returns an error if the channel reports an error, 573 // or errInputChannelClosed if the input channel is closed. 574 func (e *Execution) readMsg(ctx context.Context) (*fspb.Message, error) { 575 select { 576 case <-ctx.Done(): 577 return nil, context.Cause(ctx) 578 case m, ok := <-e.channel.In: 579 if !ok { 580 return nil, errInputChannelClosed 581 } 582 return m, nil 583 case err := <-e.channel.Err: 584 return nil, fmt.Errorf("channel produced error: %v", err) 585 } 586 } 587 588 // waitForStartupData waits for startup data from e.startupData 589 // and returns early on context cancelation. 590 // 591 // On error or timeout, it will still return a valid PID. 592 // The caller may choose to continue anyway. 593 func (e *Execution) waitForStartupData(ctx context.Context) (pid int, version string, err error) { 594 pid = e.cmd.Process.Pid 595 596 ctx, cancel := context.WithTimeout(ctx, startupDataTimeout) 597 defer cancel() 598 599 select { 600 case sd, ok := <-e.startupData: 601 if !ok { 602 return pid, "", fmt.Errorf("channel closed") 603 } 604 if int(sd.Pid) != pid { 605 log.Warningf("%s's self-reported PID (%d) is different from that of the process launched by Fleetspeak (%d)", e.daemonServiceName, sd.Pid, pid) 606 pid = int(sd.Pid) 607 } 608 version = sd.Version 609 return pid, version, nil 610 case <-ctx.Done(): 611 return pid, "", context.Cause(ctx) 612 } 613 } 614 615 // statsRoutine monitors the daemon process's resource usage, 616 // sending reports to the server at regular intervals. 617 // 618 // It runs until there is an error, or until ctx is canceled, 619 // in which case it returns the cancelation cause. 620 func (e *Execution) statsRoutine(ctx context.Context) error { 621 pid, version, err := e.waitForStartupData(ctx) 622 if err != nil { 623 // If this was due to an external cancelation, return. 624 if ctx.Err() != nil { 625 return context.Cause(ctx) 626 } 627 log.Warningf("did not receive startup data for %v, continuing anyway: %v", e.daemonServiceName, err) 628 } 629 630 var wg sync.WaitGroup 631 defer wg.Wait() 632 if e.monitorHeartbeats { 633 wg.Add(1) 634 go func() { 635 defer wg.Done() 636 e.heartbeatMonitorRoutine(ctx, pid) 637 }() 638 } 639 640 rum, err := monitoring.New(e.sc, monitoring.ResourceUsageMonitorParams{ 641 Scope: e.daemonServiceName, 642 Pid: pid, 643 MemoryLimit: e.memoryLimit, 644 ProcessStartTime: e.StartTime, 645 Version: version, 646 MaxSamplePeriod: e.samplePeriod, 647 SampleSize: e.sampleSize, 648 }) 649 if err != nil { 650 return fmt.Errorf("failed to create resource-usage monitor: %w", err) 651 } 652 653 // This blocks until ctx is canceled. 654 rum.Run(ctx) 655 return context.Cause(ctx) 656 } 657 658 // busySleep sleeps *roughly* for the given duration, not counting the time when 659 // the Fleetspeak process is suspended. It returns early when ctx is canceled. 660 func busySleep(ctx context.Context, d time.Duration) { 661 timer := time.NewTimer(time.Second) 662 defer timer.Stop() 663 664 for d > 0 { 665 select { 666 case <-timer.C: 667 step := min(d, time.Second) 668 timer.Reset(step) 669 d = d - step 670 case <-ctx.Done(): 671 return 672 } 673 } 674 } 675 676 // heartbeatMonitorRoutine monitors the daemon process's heartbeats and kills 677 // unresponsive processes. 678 // 679 // It runs until ctx is canceled. 680 func (e *Execution) heartbeatMonitorRoutine(ctx context.Context, pid int) { 681 // Give the child process some time to start up. During boot it sometimes 682 // takes significantly more time than the unresponsive_kill_period to start 683 // the child so we disable checking for heartbeats for a while. 684 e.heartbeat.Set(time.Now()) 685 busySleep(ctx, e.initialHeartbeatTimeout) 686 if ctx.Err() != nil { 687 return 688 } 689 690 for { 691 if e.sending.Get() { // Blocked trying to buffer a message for sending to the FS server. 692 busySleep(ctx, e.heartbeatTimeout) 693 if ctx.Err() != nil { 694 return 695 } 696 continue 697 } 698 timeSinceLastHB := time.Since(e.heartbeat.Get()) 699 if timeSinceLastHB > e.heartbeatTimeout { 700 // There is a very unlikely race condition if the machine gets suspended 701 // for longer than unresponsive_kill_period seconds so we give the client 702 // some time to catch up. 703 busySleep(ctx, 2*time.Second) 704 if ctx.Err() != nil { 705 return 706 } 707 708 timeSinceLastHB = time.Since(e.heartbeat.Get()) 709 if timeSinceLastHB > e.heartbeatTimeout && !e.sending.Get() { 710 // We have not received a heartbeat in a while, kill the child. 711 log.Warningf("No heartbeat received from %s (pid %d), killing.", e.daemonServiceName, pid) 712 713 // For consistency with MEMORY_EXCEEDED kills, send a notification before attempting to 714 // kill the process. 715 startTime := tspb.New(e.StartTime) 716 if err := startTime.CheckValid(); err != nil { 717 log.Errorf("Failed to convert process start time: %v", err) 718 startTime = nil 719 } 720 kn := &mpb.KillNotification{ 721 Service: e.daemonServiceName, 722 Pid: int64(pid), 723 ProcessStartTime: startTime, 724 KilledWhen: tspb.Now(), 725 Reason: mpb.KillNotification_HEARTBEAT_FAILURE, 726 } 727 if version := e.serviceVersion.Get(); version != "" { 728 kn.Version = version 729 } 730 if err := monitoring.SendProtoToServer(ctx, kn, "KillNotification", e.sc); err != nil { 731 log.Errorf("Failed to send kill notification to server: %v", err) 732 } 733 734 if err := process.KillByPid(pid); err != nil { 735 log.Errorf("Error while killing a process that doesn't heartbeat - %s (pid %d): %v", e.daemonServiceName, pid, err) 736 continue // Keep retrying. 737 } 738 return 739 } 740 } 741 // Sleep until when the next heartbeat is due. 742 busySleep(ctx, e.heartbeatTimeout-timeSinceLastHB) 743 if ctx.Err() != nil { 744 return 745 } 746 } 747 }