github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/client/daemonservice/execution/execution.go (about)

     1  // Copyright 2017 Google Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package execution provides an abstraction for a single execution of a command
    16  // with the context of daemonservice.
    17  package execution
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"flag"
    23  	"fmt"
    24  	"os"
    25  	"os/exec"
    26  	"sync"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	log "github.com/golang/glog"
    31  	"golang.org/x/sync/errgroup"
    32  	"google.golang.org/protobuf/proto"
    33  	anypb "google.golang.org/protobuf/types/known/anypb"
    34  	tspb "google.golang.org/protobuf/types/known/timestamppb"
    35  
    36  	"github.com/google/fleetspeak/fleetspeak/src/client/channel"
    37  	"github.com/google/fleetspeak/fleetspeak/src/client/daemonservice/command"
    38  	"github.com/google/fleetspeak/fleetspeak/src/client/internal/monitoring"
    39  	"github.com/google/fleetspeak/fleetspeak/src/client/internal/process"
    40  	"github.com/google/fleetspeak/fleetspeak/src/client/service"
    41  
    42  	fcpb "github.com/google/fleetspeak/fleetspeak/src/client/channel/proto/fleetspeak_channel"
    43  	dspb "github.com/google/fleetspeak/fleetspeak/src/client/daemonservice/proto/fleetspeak_daemonservice"
    44  	"github.com/google/fleetspeak/fleetspeak/src/common/fscontext"
    45  	fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak"
    46  	mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring"
    47  )
    48  
    49  // We flush the output when either of these thresholds are met.
    50  const (
    51  	maxFlushBytes           = 1 << 20 // 1MB
    52  	defaultFlushTimeSeconds = int32(60)
    53  )
    54  
    55  var (
    56  	// ErrShuttingDown is returned once an execution has started shutting down and
    57  	// is no longer accepting messages.
    58  	ErrShuttingDown = errors.New("shutting down")
    59  
    60  	stdForward = flag.Bool("std_forward", false,
    61  		"If set attaches the dependent service to the client's stdin, stdout, stderr. Meant for testing individual daemonservice integrations.")
    62  )
    63  
    64  // How long to wait for the daemon service to send startup data before
    65  // starting to monitor resource-usage anyway.
    66  var startupDataTimeout = 10 * time.Second
    67  
    68  type atomicString struct {
    69  	v atomic.Value
    70  }
    71  
    72  func (s *atomicString) Set(val string) {
    73  	s.v.Store(val)
    74  }
    75  
    76  func (s *atomicString) Get() string {
    77  	stored := s.v.Load()
    78  	if stored == nil {
    79  		return ""
    80  	}
    81  	return stored.(string)
    82  }
    83  
    84  type atomicBool struct {
    85  	v atomic.Value
    86  }
    87  
    88  func (b *atomicBool) Set(val bool) {
    89  	b.v.Store(val)
    90  }
    91  
    92  func (b *atomicBool) Get() bool {
    93  	stored := b.v.Load()
    94  	if stored == nil {
    95  		return false
    96  	}
    97  	return stored.(bool)
    98  }
    99  
   100  type atomicTime struct {
   101  	v atomic.Value
   102  }
   103  
   104  func (t *atomicTime) Set(val time.Time) {
   105  	t.v.Store(val)
   106  }
   107  
   108  func (t *atomicTime) Get() time.Time {
   109  	stored := t.v.Load()
   110  	if stored == nil {
   111  		return time.Unix(0, 0).UTC()
   112  	}
   113  	return stored.(time.Time)
   114  }
   115  
   116  // An Execution represents a specific execution of a daemonservice.
   117  type Execution struct {
   118  	daemonServiceName string
   119  	memoryLimit       int64
   120  	samplePeriod      time.Duration
   121  	sampleSize        int
   122  
   123  	shuttingDown chan struct{} // Closed when Shutdown has been called.
   124  
   125  	sc        service.Context
   126  	mu        sync.Mutex // protect e.channel
   127  	channel   *channel.Channel
   128  	cmd       *command.Command
   129  	StartTime time.Time
   130  
   131  	outData        *dspb.StdOutputData // The next output data to send, once full.
   132  	lastOut        time.Time           // Time of most recent output of outData.
   133  	outLock        sync.Mutex          // Protects outData, lastOut
   134  	outFlushBytes  int                 // How many bytes trigger an output. Constant.
   135  	outServiceName string              // The service to send StdOutput messages to. Constant.
   136  
   137  	shutdown   sync.Once
   138  	lastActive int64 // Time of the last message input or output in seconds since epoch (UTC), atomic access only.
   139  
   140  	dead chan struct{} // closed when the underlying process has died.
   141  
   142  	waitForGoroutines func() error
   143  	startupData       chan *fcpb.StartupData // Startup data sent by the daemon process.
   144  
   145  	heartbeat               atomicTime    // Time when the last message was received from the daemon process.
   146  	monitorHeartbeats       bool          // Whether to monitor the daemon process's heartbeats, killing it if it doesn't heartbeat often enough.
   147  	initialHeartbeatTimeout time.Duration // How long to wait for the initial heartbeat.
   148  	heartbeatTimeout        time.Duration // How long to wait for subsequent heartbeats.
   149  	sending                 atomicBool    // Indicates whether a message-send operation is in progress.
   150  	serviceVersion          atomicString  // Version reported by the daemon process.
   151  }
   152  
   153  var errProcessTerminated = errors.New("process terminated")
   154  
   155  // New creates and starts an execution of the command described in cfg. Messages
   156  // received from the resulting process are passed to sc, as are StdOutput and
   157  // ResourceUsage messages.
   158  //
   159  // The context needs to be valid for the lifetime of the whole process execution!
   160  func New(ctx context.Context, daemonServiceName string, cfg *dspb.Config, sc service.Context) (*Execution, error) {
   161  	cfg = proto.Clone(cfg).(*dspb.Config)
   162  
   163  	ret := Execution{
   164  		daemonServiceName: daemonServiceName,
   165  		memoryLimit:       cfg.MemoryLimit,
   166  		sampleSize:        int(cfg.ResourceMonitoringSampleSize),
   167  		samplePeriod:      cfg.ResourceMonitoringSamplePeriod.AsDuration(),
   168  
   169  		shuttingDown: make(chan struct{}),
   170  
   171  		sc:        sc,
   172  		cmd:       &command.Command{Cmd: *exec.Command(cfg.Argv[0], cfg.Argv[1:]...)},
   173  		StartTime: time.Now(),
   174  
   175  		outData: &dspb.StdOutputData{},
   176  		lastOut: time.Now(),
   177  
   178  		dead:        make(chan struct{}),
   179  		startupData: make(chan *fcpb.StartupData, 1),
   180  
   181  		monitorHeartbeats:       cfg.MonitorHeartbeats,
   182  		initialHeartbeatTimeout: cfg.HeartbeatUnresponsiveGracePeriod.AsDuration(),
   183  		heartbeatTimeout:        cfg.HeartbeatUnresponsiveKillPeriod.AsDuration(),
   184  	}
   185  
   186  	if ret.initialHeartbeatTimeout <= 0 {
   187  		ret.initialHeartbeatTimeout = time.Duration(cfg.HeartbeatUnresponsiveGracePeriodSeconds) * time.Second
   188  	}
   189  	if ret.heartbeatTimeout <= 0 {
   190  		ret.heartbeatTimeout = time.Duration(cfg.HeartbeatUnresponsiveKillPeriodSeconds) * time.Second
   191  	}
   192  	if ret.samplePeriod <= 0 {
   193  		ret.samplePeriod = time.Duration(cfg.ResourceMonitoringSamplePeriodSeconds) * time.Second
   194  	}
   195  
   196  	var err error
   197  	ret.channel, err = ret.cmd.SetupCommsChannel()
   198  	if err != nil {
   199  		return nil, fmt.Errorf("failed to setup a comms channel: %v", err)
   200  	}
   201  
   202  	if cfg.StdParams != nil && cfg.StdParams.ServiceName == "" {
   203  		log.Errorf("std_params is set, but service_name is empty. Ignoring std_params: %v", cfg.StdParams)
   204  		cfg.StdParams = nil
   205  	}
   206  
   207  	if *stdForward {
   208  		log.Warningf("std_forward is set, connecting std... to %s", cfg.Argv[0])
   209  		ret.cmd.Stdin = os.Stdin
   210  		ret.cmd.Stdout = os.Stdout
   211  		ret.cmd.Stderr = os.Stderr
   212  	} else if cfg.StdParams != nil {
   213  		if cfg.StdParams.FlushBytes <= 0 || cfg.StdParams.FlushBytes > maxFlushBytes {
   214  			cfg.StdParams.FlushBytes = maxFlushBytes
   215  		}
   216  		if cfg.StdParams.FlushTimeSeconds <= 0 {
   217  			cfg.StdParams.FlushTimeSeconds = defaultFlushTimeSeconds
   218  		}
   219  		ret.outServiceName = cfg.StdParams.ServiceName
   220  		ret.outFlushBytes = int(cfg.StdParams.FlushBytes)
   221  		ret.cmd.Stdout = stdoutWriter{&ret}
   222  		ret.cmd.Stderr = stderrWriter{&ret}
   223  	} else {
   224  		ret.cmd.Stdout = nil
   225  		ret.cmd.Stderr = nil
   226  	}
   227  
   228  	if err := ret.cmd.Start(); err != nil {
   229  		// Close all contained channels before discarding the ret object.
   230  		close(ret.shuttingDown)
   231  		close(ret.dead)
   232  		close(ret.startupData)
   233  		return nil, err
   234  	}
   235  
   236  	errProcessExited := errors.New("process exited")
   237  	ctx, cancel := fscontext.WithDoneChan(ctx, ErrShuttingDown, ret.shuttingDown)
   238  	eg, ctx := errgroup.WithContext(ctx)
   239  	ret.waitForGoroutines = func() error {
   240  		cancel(nil)
   241  
   242  		err := eg.Wait()
   243  
   244  		// These are expected errors during a normal shutdown.
   245  		if errors.Is(err, errInputChannelClosed) || errors.Is(err, ErrShuttingDown) || errors.Is(err, errProcessExited) {
   246  			log.Warningf("during wait: ignoring shutdown error: %v", err)
   247  			return nil
   248  		}
   249  		return err
   250  	}
   251  
   252  	if cfg.StdParams != nil {
   253  		eg.Go(func() error {
   254  			ctx, cancel := fscontext.WithDoneChan(context.TODO(), ErrShuttingDown, ret.dead)
   255  			defer cancel(nil)
   256  			period := time.Second * time.Duration(cfg.StdParams.FlushTimeSeconds)
   257  			return ret.stdFlushRoutine(ctx, period)
   258  		})
   259  	}
   260  	eg.Go(func() (err error) {
   261  		defer ret.Shutdown()
   262  		defer func() { cancel(err) }()
   263  		return ret.inRoutine(ctx)
   264  	})
   265  	if !cfg.DisableResourceMonitoring {
   266  		eg.Go(func() (err error) {
   267  			defer func() { cancel(err) }()
   268  			return ret.statsRoutine(ctx)
   269  		})
   270  	}
   271  	eg.Go(func() (err error) {
   272  		defer ret.Shutdown()
   273  		defer func() { cancel(errProcessExited) }()
   274  		defer func() {
   275  			startTime := tspb.New(ret.StartTime)
   276  			if err := startTime.CheckValid(); err != nil {
   277  				log.Errorf("Failed to convert process start time: %v", err)
   278  			}
   279  			if !cfg.DisableResourceMonitoring {
   280  				rud := &mpb.ResourceUsageData{
   281  					Scope:             ret.daemonServiceName,
   282  					Pid:               int64(ret.cmd.Process.Pid),
   283  					ProcessStartTime:  startTime,
   284  					DataTimestamp:     tspb.Now(),
   285  					ResourceUsage:     &mpb.AggregatedResourceUsage{},
   286  					ProcessTerminated: true,
   287  				}
   288  				// The outer context might already be canceled.
   289  				ctx := context.WithoutCancel(ctx)
   290  				if err := monitoring.SendProtoToServer(ctx, rud, "ResourceUsage", ret.sc); err != nil {
   291  					log.Errorf("Failed to send final resource-usage proto: %v", err)
   292  				}
   293  			}
   294  		}()
   295  		waitResult := ret.cmd.Wait()
   296  		close(ret.dead)
   297  		if waitResult != nil {
   298  			cancel(fmt.Errorf("subprocess exited with error: %v", waitResult))
   299  		}
   300  		return waitResult
   301  	})
   302  	return &ret, nil
   303  }
   304  
   305  // Wait waits for all aspects of this execution to finish. This should happen
   306  // soon after shutdown is called.  Wait must be called to free up resources and
   307  // to reap the child process.
   308  func (e *Execution) Wait() error {
   309  	<-e.shuttingDown
   310  	// Flush out channel, as a reader should do.
   311  	for range e.channel.In {
   312  	}
   313  	e.channel.Wait()
   314  	for range e.channel.Err {
   315  	}
   316  	return e.waitForGoroutines()
   317  }
   318  
   319  // LastActive returns the last time that a message was sent or received, to the
   320  // nearest second.
   321  func (e *Execution) LastActive() time.Time {
   322  	return time.Unix(atomic.LoadInt64(&e.lastActive), 0).UTC()
   323  }
   324  
   325  func (e *Execution) setLastActive(t time.Time) {
   326  	atomic.StoreInt64(&e.lastActive, t.Unix())
   327  }
   328  
   329  func dataSize(o *dspb.StdOutputData) int {
   330  	return len(o.Stdout) + len(o.Stderr)
   331  }
   332  
   333  // flushOut flushes e.outData. It assumes that e.outLock is already held.
   334  func (e *Execution) flushOut() {
   335  	// set lastOut before the blocking call to sc.Send, so the next flush
   336  	// has an accurate sense of how stale the data might be.
   337  	n := time.Now()
   338  	e.lastOut = n
   339  	if dataSize(e.outData) == 0 {
   340  		return
   341  	}
   342  	e.setLastActive(n)
   343  
   344  	e.outData.Pid = int64(e.cmd.Process.Pid)
   345  	d, err := anypb.New(e.outData)
   346  	if err != nil {
   347  		log.Errorf("unable to marshal StdOutputData: %v", err)
   348  	} else {
   349  		e.sc.Send(context.Background(), service.AckMessage{
   350  			M: &fspb.Message{
   351  				Destination: &fspb.Address{ServiceName: e.outServiceName},
   352  				MessageType: "StdOutput",
   353  				Data:        d,
   354  			}})
   355  	}
   356  	e.outData = &dspb.StdOutputData{
   357  		MessageIndex: e.outData.MessageIndex + 1,
   358  	}
   359  }
   360  
   361  func (e *Execution) writeToOut(p []byte, isErr bool) {
   362  	e.outLock.Lock()
   363  	defer e.outLock.Unlock()
   364  
   365  	for {
   366  		currSize := dataSize(e.outData)
   367  		// If it all fits, write it and return.
   368  		if currSize+len(p) <= e.outFlushBytes {
   369  			if isErr {
   370  				e.outData.Stderr = append(e.outData.Stderr, p...)
   371  			} else {
   372  				e.outData.Stdout = append(e.outData.Stdout, p...)
   373  			}
   374  			return
   375  		}
   376  		// Write what does fit, flush, continue.
   377  		toWrite := e.outFlushBytes - currSize
   378  		if isErr {
   379  			e.outData.Stderr = append(e.outData.Stderr, p[:toWrite]...)
   380  		} else {
   381  			e.outData.Stdout = append(e.outData.Stdout, p[:toWrite]...)
   382  		}
   383  		p = p[toWrite:]
   384  		e.flushOut()
   385  	}
   386  }
   387  
   388  type stdoutWriter struct {
   389  	e *Execution
   390  }
   391  
   392  func (w stdoutWriter) Write(p []byte) (int, error) {
   393  	w.e.writeToOut(p, false)
   394  	return len(p), nil
   395  }
   396  
   397  type stderrWriter struct {
   398  	e *Execution
   399  }
   400  
   401  func (w stderrWriter) Write(p []byte) (int, error) {
   402  	w.e.writeToOut(p, true)
   403  	return len(p), nil
   404  }
   405  
   406  // stdFlushRoutine calls e.flushOut() periodically with e.outLock held.
   407  // When ctx is canceled, it does it one last time and returns the context's
   408  // cancelation cause.
   409  func (e *Execution) stdFlushRoutine(ctx context.Context, period time.Duration) error {
   410  	t := time.NewTicker(period)
   411  	defer t.Stop()
   412  	for {
   413  		select {
   414  		case <-t.C:
   415  			e.outLock.Lock()
   416  			if e.lastOut.Add(period).Before(time.Now()) {
   417  				e.flushOut()
   418  			}
   419  			e.outLock.Unlock()
   420  		case <-ctx.Done():
   421  			e.outLock.Lock()
   422  			e.flushOut()
   423  			e.outLock.Unlock()
   424  			return context.Cause(ctx)
   425  		}
   426  	}
   427  }
   428  
   429  func sleepCtx(ctx context.Context, d time.Duration) {
   430  	t := time.NewTimer(d)
   431  	defer t.Stop()
   432  	select {
   433  	case <-ctx.Done():
   434  		return
   435  	case <-t.C:
   436  		return
   437  	}
   438  }
   439  
   440  // Shutdown shuts down this execution.
   441  func (e *Execution) Shutdown() {
   442  	e.shutdown.Do(func() {
   443  		// First we attempt a gentle shutdown. Closing e.shuttingDown tells our user
   444  		// not to give us any more data through SendMsg().
   445  		close(e.shuttingDown)
   446  
   447  		// Closing e.channel.Out will cause channel to close the pipe to the
   448  		// dependent process, which then causes it to clean up nicely.
   449  		e.mu.Lock()
   450  		close(e.channel.Out)
   451  		e.mu.Unlock()
   452  
   453  		// Context bound to the process lifetime
   454  		ctx, cancel := fscontext.WithDoneChan(context.TODO(), errProcessTerminated, e.dead)
   455  		defer cancel(nil)
   456  
   457  		sleepCtx(ctx, time.Second)
   458  		if ctx.Err() != nil {
   459  			return
   460  		}
   461  
   462  		// This pattern is technically racy - the process could end and the process
   463  		// id could be recycled since the end of sleepCtx and before we SoftKill
   464  		// or Kill using the process id.
   465  		//
   466  		// A formally correct way to implement this is to spawn a wrapper process
   467  		// which does not die in response to SIGTERM but forwards the signal to the
   468  		// wrapped process - its child. This would ensure that the process is still
   469  		// around all the way to the SIGKILL.
   470  		if err := e.cmd.SoftKill(); err != nil {
   471  			log.Errorf("SoftKill [%d] returned error: %v", e.cmd.Process.Pid, err)
   472  		}
   473  		sleepCtx(ctx, time.Second)
   474  		if ctx.Err() != nil {
   475  			return
   476  		}
   477  
   478  		if err := e.cmd.Kill(); err != nil {
   479  			log.Errorf("Kill [%d] returned error: %v", e.cmd.Process.Pid, err)
   480  		}
   481  		sleepCtx(ctx, time.Second)
   482  		if ctx.Err() != nil {
   483  			return
   484  		}
   485  
   486  		// It is hard to imagine how we might end up here - maybe the process is
   487  		// somehow stuck in a system call or there is some other OS level weirdness.
   488  		// One possibility is that cmd is a zombie process now.
   489  		log.Errorf("Subprocess [%d] appears to have survived SIGKILL.", e.cmd.Process.Pid)
   490  	})
   491  }
   492  
   493  // inRoutine reads messages from the dependent process and passes them to
   494  // fleetspeak.  It returns with an appropriate error when the input channel is
   495  // closed or has an error.
   496  func (e *Execution) inRoutine(ctx context.Context) error {
   497  	var startupDone bool
   498  
   499  	for {
   500  		m, err := e.readMsg(ctx)
   501  		if err != nil {
   502  			if errors.Is(err, errInputChannelClosed) && ctx.Err() != nil {
   503  				return nil // This is expected during shutdown
   504  			}
   505  			return err
   506  		}
   507  		e.setLastActive(time.Now())
   508  		e.heartbeat.Set(time.Now())
   509  
   510  		if m.Destination != nil && m.Destination.ServiceName == "system" {
   511  			switch m.MessageType {
   512  			case "StartupData":
   513  				if startupDone {
   514  					log.Warning("Received spurious startup message, ignoring.")
   515  					break
   516  				}
   517  				startupDone = true
   518  
   519  				sd := &fcpb.StartupData{}
   520  				if err := m.Data.UnmarshalTo(sd); err != nil {
   521  					log.Warningf("Failed to parse startup data from initial message: %v", err)
   522  				} else {
   523  					if sd.Version != "" {
   524  						e.serviceVersion.Set(sd.Version)
   525  					}
   526  					e.startupData <- sd
   527  				}
   528  				close(e.startupData) // No more values to send through the channel.
   529  			case "Heartbeat":
   530  				// Pass, handled above.
   531  			default:
   532  				log.Warningf("Unknown system message type: %s", m.MessageType)
   533  			}
   534  		} else {
   535  			e.sending.Set(true)
   536  			// sc.Send() buffers the message locally for sending to the Fleetspeak server. It will
   537  			// block if the buffer is full.
   538  			if err := e.sc.Send(context.Background(), service.AckMessage{M: m}); err != nil {
   539  				log.Errorf("error sending message to server: %v", err)
   540  			}
   541  			e.sending.Set(false)
   542  		}
   543  	}
   544  }
   545  
   546  // SendMsg sends m to the execution.
   547  //
   548  // It returns nil on success, ErrShuttingDown if the output channel was already
   549  // closed, and context.Cause(ctx) if ctx was canceled.
   550  func (e *Execution) SendMsg(ctx context.Context, m *fspb.Message) error {
   551  	select {
   552  	case <-e.shuttingDown:
   553  		// already shutting down, the e.channel.Out channel might already be closed
   554  		return ErrShuttingDown
   555  	default:
   556  	}
   557  
   558  	e.mu.Lock()
   559  	defer e.mu.Unlock()
   560  
   561  	select {
   562  	case <-ctx.Done():
   563  		return context.Cause(ctx)
   564  	case e.channel.Out <- m:
   565  		return nil
   566  	}
   567  }
   568  
   569  var errInputChannelClosed = errors.New("input channel closed")
   570  
   571  // readMsg blocks until a message is available from the channel.
   572  // It returns an error if the channel reports an error,
   573  // or errInputChannelClosed if the input channel is closed.
   574  func (e *Execution) readMsg(ctx context.Context) (*fspb.Message, error) {
   575  	select {
   576  	case <-ctx.Done():
   577  		return nil, context.Cause(ctx)
   578  	case m, ok := <-e.channel.In:
   579  		if !ok {
   580  			return nil, errInputChannelClosed
   581  		}
   582  		return m, nil
   583  	case err := <-e.channel.Err:
   584  		return nil, fmt.Errorf("channel produced error: %v", err)
   585  	}
   586  }
   587  
   588  // waitForStartupData waits for startup data from e.startupData
   589  // and returns early on context cancelation.
   590  //
   591  // On error or timeout, it will still return a valid PID.
   592  // The caller may choose to continue anyway.
   593  func (e *Execution) waitForStartupData(ctx context.Context) (pid int, version string, err error) {
   594  	pid = e.cmd.Process.Pid
   595  
   596  	ctx, cancel := context.WithTimeout(ctx, startupDataTimeout)
   597  	defer cancel()
   598  
   599  	select {
   600  	case sd, ok := <-e.startupData:
   601  		if !ok {
   602  			return pid, "", fmt.Errorf("channel closed")
   603  		}
   604  		if int(sd.Pid) != pid {
   605  			log.Warningf("%s's self-reported PID (%d) is different from that of the process launched by Fleetspeak (%d)", e.daemonServiceName, sd.Pid, pid)
   606  			pid = int(sd.Pid)
   607  		}
   608  		version = sd.Version
   609  		return pid, version, nil
   610  	case <-ctx.Done():
   611  		return pid, "", context.Cause(ctx)
   612  	}
   613  }
   614  
   615  // statsRoutine monitors the daemon process's resource usage,
   616  // sending reports to the server at regular intervals.
   617  //
   618  // It runs until there is an error, or until ctx is canceled,
   619  // in which case it returns the cancelation cause.
   620  func (e *Execution) statsRoutine(ctx context.Context) error {
   621  	pid, version, err := e.waitForStartupData(ctx)
   622  	if err != nil {
   623  		// If this was due to an external cancelation, return.
   624  		if ctx.Err() != nil {
   625  			return context.Cause(ctx)
   626  		}
   627  		log.Warningf("did not receive startup data for %v, continuing anyway: %v", e.daemonServiceName, err)
   628  	}
   629  
   630  	var wg sync.WaitGroup
   631  	defer wg.Wait()
   632  	if e.monitorHeartbeats {
   633  		wg.Add(1)
   634  		go func() {
   635  			defer wg.Done()
   636  			e.heartbeatMonitorRoutine(ctx, pid)
   637  		}()
   638  	}
   639  
   640  	rum, err := monitoring.New(e.sc, monitoring.ResourceUsageMonitorParams{
   641  		Scope:            e.daemonServiceName,
   642  		Pid:              pid,
   643  		MemoryLimit:      e.memoryLimit,
   644  		ProcessStartTime: e.StartTime,
   645  		Version:          version,
   646  		MaxSamplePeriod:  e.samplePeriod,
   647  		SampleSize:       e.sampleSize,
   648  	})
   649  	if err != nil {
   650  		return fmt.Errorf("failed to create resource-usage monitor: %w", err)
   651  	}
   652  
   653  	// This blocks until ctx is canceled.
   654  	rum.Run(ctx)
   655  	return context.Cause(ctx)
   656  }
   657  
   658  // busySleep sleeps *roughly* for the given duration, not counting the time when
   659  // the Fleetspeak process is suspended.  It returns early when ctx is canceled.
   660  func busySleep(ctx context.Context, d time.Duration) {
   661  	timer := time.NewTimer(time.Second)
   662  	defer timer.Stop()
   663  
   664  	for d > 0 {
   665  		select {
   666  		case <-timer.C:
   667  			step := min(d, time.Second)
   668  			timer.Reset(step)
   669  			d = d - step
   670  		case <-ctx.Done():
   671  			return
   672  		}
   673  	}
   674  }
   675  
   676  // heartbeatMonitorRoutine monitors the daemon process's heartbeats and kills
   677  // unresponsive processes.
   678  //
   679  // It runs until ctx is canceled.
   680  func (e *Execution) heartbeatMonitorRoutine(ctx context.Context, pid int) {
   681  	// Give the child process some time to start up. During boot it sometimes
   682  	// takes significantly more time than the unresponsive_kill_period to start
   683  	// the child so we disable checking for heartbeats for a while.
   684  	e.heartbeat.Set(time.Now())
   685  	busySleep(ctx, e.initialHeartbeatTimeout)
   686  	if ctx.Err() != nil {
   687  		return
   688  	}
   689  
   690  	for {
   691  		if e.sending.Get() { // Blocked trying to buffer a message for sending to the FS server.
   692  			busySleep(ctx, e.heartbeatTimeout)
   693  			if ctx.Err() != nil {
   694  				return
   695  			}
   696  			continue
   697  		}
   698  		timeSinceLastHB := time.Since(e.heartbeat.Get())
   699  		if timeSinceLastHB > e.heartbeatTimeout {
   700  			// There is a very unlikely race condition if the machine gets suspended
   701  			// for longer than unresponsive_kill_period seconds so we give the client
   702  			// some time to catch up.
   703  			busySleep(ctx, 2*time.Second)
   704  			if ctx.Err() != nil {
   705  				return
   706  			}
   707  
   708  			timeSinceLastHB = time.Since(e.heartbeat.Get())
   709  			if timeSinceLastHB > e.heartbeatTimeout && !e.sending.Get() {
   710  				// We have not received a heartbeat in a while, kill the child.
   711  				log.Warningf("No heartbeat received from %s (pid %d), killing.", e.daemonServiceName, pid)
   712  
   713  				// For consistency with MEMORY_EXCEEDED kills, send a notification before attempting to
   714  				// kill the process.
   715  				startTime := tspb.New(e.StartTime)
   716  				if err := startTime.CheckValid(); err != nil {
   717  					log.Errorf("Failed to convert process start time: %v", err)
   718  					startTime = nil
   719  				}
   720  				kn := &mpb.KillNotification{
   721  					Service:          e.daemonServiceName,
   722  					Pid:              int64(pid),
   723  					ProcessStartTime: startTime,
   724  					KilledWhen:       tspb.Now(),
   725  					Reason:           mpb.KillNotification_HEARTBEAT_FAILURE,
   726  				}
   727  				if version := e.serviceVersion.Get(); version != "" {
   728  					kn.Version = version
   729  				}
   730  				if err := monitoring.SendProtoToServer(ctx, kn, "KillNotification", e.sc); err != nil {
   731  					log.Errorf("Failed to send kill notification to server: %v", err)
   732  				}
   733  
   734  				if err := process.KillByPid(pid); err != nil {
   735  					log.Errorf("Error while killing a process that doesn't heartbeat - %s (pid %d): %v", e.daemonServiceName, pid, err)
   736  					continue // Keep retrying.
   737  				}
   738  				return
   739  			}
   740  		}
   741  		// Sleep until when the next heartbeat is due.
   742  		busySleep(ctx, e.heartbeatTimeout-timeSinceLastHB)
   743  		if ctx.Err() != nil {
   744  			return
   745  		}
   746  	}
   747  }