github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/client/internal/monitoring/resource_usage_monitor.go

github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/client/internal/monitoring/resource_usage_monitor.go (about)

     1  // Copyright 2017 Google Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package monitoring
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"math"
    22  	"time"
    23  
    24  	anypb "google.golang.org/protobuf/types/known/anypb"
    25  
    26  	log "github.com/golang/glog"
    27  	"google.golang.org/protobuf/proto"
    28  	tspb "google.golang.org/protobuf/types/known/timestamppb"
    29  
    30  	"github.com/google/fleetspeak/fleetspeak/src/client/internal/process"
    31  	"github.com/google/fleetspeak/fleetspeak/src/client/service"
    32  
    33  	fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak"
    34  	mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring"
    35  )
    36  
const (
	// epsilon is the tolerance applied when checking that cumulative CPU-time
	// counters did not decrease between two consecutive samples.
	epsilon             float64 = 1e-4
	// defaultSampleSize is the sample size New uses when
	// ResourceUsageMonitorParams.SampleSize is left at zero.
	defaultSampleSize           = 20
	// defaultSamplePeriod is the maximum sample period New uses when
	// ResourceUsageMonitorParams.MaxSamplePeriod is left at zero.
	defaultSamplePeriod         = 30 * time.Second
)
    42  
    43  // AggregateResourceUsage is a helper function for aggregating resource-usage data across multiple
    44  // resource-usage queries. It should be called once, in sequence, for each ResourceUsage result.
    45  //
    46  // 'numRUCalls' is the number of resource-usage samples aggregated into one AggregatedResourceUsage
    47  // proto; it is used to compute mean metrics.
    48  // 'aggRU' is only updated if no error is encountered.
    49  //
    50  // We don't get memory usage data from finished commands. The commandFinished
    51  // bool argument makes this function skip memory usage aggregation.
    52  func AggregateResourceUsage(prevRU *ResourceUsage, currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage, commandFinished bool) error {
    53  	if numRUCalls < 2 {
    54  		return errors.New("number of resource-usage calls should be at least 2 (for rate computation)")
    55  	}
    56  	if aggRU == nil {
    57  		return errors.New("aggregated resource-usage proto should not be nil")
    58  	}
    59  
    60  	if prevRU == nil {
    61  		if !proto.Equal(aggRU, &mpb.AggregatedResourceUsage{}) {
    62  			return fmt.Errorf(
    63  				"previous resource-usage is nil, but aggregated proto already has fields set: %v", aggRU)
    64  		}
    65  		aggRU.MeanResidentMemory = float64(currRU.ResidentMemory) / float64(numRUCalls)
    66  		aggRU.MaxResidentMemory = currRU.ResidentMemory
    67  		aggRU.MeanNumFds = float64(currRU.NumFDs) / float64(numRUCalls)
    68  		aggRU.MaxNumFds = currRU.NumFDs
    69  		return nil
    70  	}
    71  
    72  	if !currRU.Timestamp.After(prevRU.Timestamp) {
    73  		return fmt.Errorf(
    74  			"timestamp for current resource-usage[%v] should be > that of previous resource-usage[%v]",
    75  			currRU.Timestamp, prevRU.Timestamp)
    76  	}
    77  
    78  	if err := aggregateTimeResourceUsage(prevRU, currRU, numRUCalls, aggRU); err != nil {
    79  		return err
    80  	}
    81  
    82  	if commandFinished {
    83  		return nil
    84  	}
    85  
    86  	aggregateMemoryResourceUsage(currRU, numRUCalls, aggRU)
    87  	aggregateNumFDsResourceUsage(currRU, numRUCalls, aggRU)
    88  
    89  	return nil
    90  }
    91  
    92  func aggregateTimeResourceUsage(prevRU *ResourceUsage, currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) error {
    93  	if currRU.UserCPUMillis+epsilon < prevRU.UserCPUMillis {
    94  		return fmt.Errorf(
    95  			"cumulative user-mode CPU-usage is not expected to decrease: [%v -> %v]",
    96  			prevRU.UserCPUMillis, currRU.UserCPUMillis)
    97  	}
    98  
    99  	if currRU.SystemCPUMillis+epsilon < prevRU.SystemCPUMillis {
   100  		return fmt.Errorf(
   101  			"cumulative system-mode CPU-usage is not expected to decrease: [%v -> %v]",
   102  			prevRU.SystemCPUMillis, currRU.SystemCPUMillis)
   103  	}
   104  
   105  	elapsedSecs := currRU.Timestamp.Sub(prevRU.Timestamp).Seconds()
   106  	userCPURate := (currRU.UserCPUMillis - prevRU.UserCPUMillis) / elapsedSecs
   107  	systemCPURate := (currRU.SystemCPUMillis - prevRU.SystemCPUMillis) / elapsedSecs
   108  
   109  	// Note that since rates are computed between two consecutive data-points, their
   110  	// average uses a sample size of n - 1, where n is the number of resource-usage queries.
   111  	aggRU.MeanUserCpuRate += userCPURate / float64(numRUCalls-1)
   112  	aggRU.MaxUserCpuRate = math.Max(userCPURate, aggRU.MaxUserCpuRate)
   113  	aggRU.MeanSystemCpuRate += systemCPURate / float64(numRUCalls-1)
   114  	aggRU.MaxSystemCpuRate = math.Max(systemCPURate, aggRU.MaxSystemCpuRate)
   115  
   116  	return nil
   117  }
   118  
   119  func aggregateMemoryResourceUsage(currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) {
   120  	// Note that since rates are computed between two consecutive data-points, their
   121  	// average uses a sample size of n - 1, where n is the number of resource-usage queries.
   122  	aggRU.MeanResidentMemory += float64(currRU.ResidentMemory) / float64(numRUCalls)
   123  	if currRU.ResidentMemory > aggRU.MaxResidentMemory {
   124  		aggRU.MaxResidentMemory = currRU.ResidentMemory
   125  	}
   126  }
   127  
   128  func aggregateNumFDsResourceUsage(currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) {
   129  	aggRU.MeanNumFds += float64(currRU.NumFDs) / float64(numRUCalls)
   130  	if currRU.NumFDs > aggRU.MaxNumFds {
   131  		aggRU.MaxNumFds = currRU.NumFDs
   132  	}
   133  }
   134  
   135  // AggregateResourceUsageForFinishedCmd computes resource-usage for a finished process, given
   136  // resource-usage before and after the process ran.
   137  func AggregateResourceUsageForFinishedCmd(initialRU, finalRU *ResourceUsage) (*mpb.AggregatedResourceUsage, error) {
   138  	aggRU := mpb.AggregatedResourceUsage{}
   139  	err := AggregateResourceUsage(nil, initialRU, 2, &aggRU, true)
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  	err = AggregateResourceUsage(initialRU, finalRU, 2, &aggRU, true)
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  
   148  	// If this field is untouched, we have not aggregated memory resource usage
   149  	// for this process yet. We fill it in with what we have.
   150  	// TODO
   151  	if aggRU.MaxResidentMemory == 0 {
   152  		aggRU.MeanResidentMemory = float64(initialRU.ResidentMemory)
   153  		aggRU.MaxResidentMemory = initialRU.ResidentMemory
   154  	}
   155  
   156  	return &aggRU, nil
   157  }
   158  
// resourceUsageFetcherI is the interface for ResourceUsageFetcher, to
// facilitate stubbing out of the real fetcher in tests.
type resourceUsageFetcherI interface {
	// ResourceUsageForPID returns a resource-usage sample for the process
	// with the given pid.
	ResourceUsageForPID(pid int) (*ResourceUsage, error)
	// DebugStatusForPID returns a debug-status string for the process with
	// the given pid; the monitor attaches it to each report it sends.
	DebugStatusForPID(pid int) (string, error)
}
   164  
// ResourceUsageMonitor computes resource-usage metrics for a process and
// periodically sends them to the server through its service.Context.
// Instances are created with New and driven by Run.
type ResourceUsageMonitor struct {
	// sc is used to send reports and kill notifications to the server.
	sc service.Context

	// scope names what is monitored; it is reported as the scope of usage
	// data and as the service name of kill notifications.
	scope             string
	// pid is the id of the monitored process.
	pid               int
	// memoryLimit is the resident-memory limit in bytes; values <= 0 disable
	// enforcement.
	memoryLimit       int64
	// version is the version string included in reports.
	version           string
	// processStartTime is nil when the start time was not provided.
	processStartTime  *tspb.Timestamp
	// maxSamplePeriod caps the interval between two samples.
	maxSamplePeriod   time.Duration
	// initialSampleSize is the number of samples in the first report; it is
	// larger than sampleSize to account for the initial backoff.
	initialSampleSize int
	// sampleSize is the number of samples in every later report.
	sampleSize        int

	// ruf fetches resource usage and debug status for pid.
	ruf      resourceUsageFetcherI
	// errChan receives monitor errors; when nil, errors are logged instead.
	errChan  chan<- error
	// NOTE(review): doneChan is not referenced by any code visible in this
	// file - confirm whether it is still needed.
	doneChan <-chan struct{}
}
   183  
// ResourceUsageMonitorParams contains parameters that might be set when
// creating a ResourceUsageMonitor.
type ResourceUsageMonitorParams struct {
	// What we are monitoring. Typically a service name, or 'system' for the
	// Fleetspeak client itself.
	Scope string

	// The version string of the service that we are monitoring, if known.
	Version string

	// The process id that we are monitoring.
	Pid int

	// If positive, the monitored process is killed once its resident memory
	// reaches or exceeds this limit, in bytes.
	MemoryLimit int64

	// The time that the process was started (if known). The zero value means
	// unknown.
	ProcessStartTime time.Time

	// The longest time to wait between samples. If zero, defaultSamplePeriod
	// is used.
	MaxSamplePeriod time.Duration

	// The number of resource-usage query results that get aggregated into
	// a single resource-usage report sent to Fleetspeak servers. If zero,
	// defaultSampleSize is used; otherwise it must be at least 2.
	SampleSize int

	// If set, the resource monitor will report errors on this channel. If unset,
	// errors will be logged. The monitor blocks on each send until the error
	// is received.
	Err chan<- error

	// If set, stubs out the actual resource fetching. Meant for use only in unit tests.
	ruf resourceUsageFetcherI
}
   218  
   219  // New returns a new ResourceUsageMonitor.
   220  // Once created, it must be started with Run().
   221  func New(sc service.Context, params ResourceUsageMonitorParams) (*ResourceUsageMonitor, error) {
   222  	var startTimeProto *tspb.Timestamp
   223  
   224  	if !params.ProcessStartTime.IsZero() {
   225  		startTimeProto = tspb.New(params.ProcessStartTime)
   226  		if err := startTimeProto.CheckValid(); err != nil {
   227  			return nil, fmt.Errorf("process start time is invalid: %v", err)
   228  		}
   229  	}
   230  
   231  	if params.SampleSize == 0 {
   232  		params.SampleSize = defaultSampleSize
   233  	}
   234  	if params.MaxSamplePeriod == 0 {
   235  		params.MaxSamplePeriod = defaultSamplePeriod
   236  	}
   237  	if params.SampleSize < 2 {
   238  		return nil, fmt.Errorf("sample size %d invalid - must be at least 2 (for rate computation)", params.SampleSize)
   239  	}
   240  
   241  	maxSamplePeriodSecs := int(params.MaxSamplePeriod / time.Second)
   242  	var backoffSize int
   243  	if maxSamplePeriodSecs == 0 {
   244  		backoffSize = 0
   245  	} else {
   246  		backoffSize = int(math.Log2(float64(maxSamplePeriodSecs)))
   247  	}
   248  	// First sample is bigger because of the backoff.
   249  	initialSampleSize := params.SampleSize + backoffSize
   250  
   251  	if params.ruf == nil {
   252  		params.ruf = ResourceUsageFetcher{}
   253  	}
   254  
   255  	m := ResourceUsageMonitor{
   256  		sc: sc,
   257  
   258  		scope:             params.Scope,
   259  		pid:               params.Pid,
   260  		memoryLimit:       params.MemoryLimit,
   261  		version:           params.Version,
   262  		processStartTime:  startTimeProto,
   263  		maxSamplePeriod:   params.MaxSamplePeriod,
   264  		initialSampleSize: initialSampleSize,
   265  		sampleSize:        params.SampleSize,
   266  
   267  		ruf:     params.ruf,
   268  		errChan: params.Err,
   269  	}
   270  
   271  	return &m, nil
   272  }
   273  
   274  // Run is the business method of the resource-usage monitor.
   275  // It blocks until ctx is canceled.
   276  func (m *ResourceUsageMonitor) Run(ctx context.Context) {
   277  	ctx, cancel := context.WithCancel(ctx)
   278  	defer cancel()
   279  
   280  	// 1s, 2s, 4s, 8s, 16s, ..., m.maxSamplePeriod, m.maxSamplePeriod, m.maxSamplePeriod, ...
   281  	backoffPeriod := min(time.Second, m.maxSamplePeriod)
   282  	backoff := time.NewTimer(backoffPeriod)
   283  	defer backoff.Stop()
   284  	initialSample := true
   285  
   286  	var prevRU *ResourceUsage
   287  	aggRU := mpb.AggregatedResourceUsage{}
   288  	numSamplesCollected := 0
   289  
   290  	resetSamples := func() {
   291  		prevRU = nil
   292  		aggRU = mpb.AggregatedResourceUsage{}
   293  		numSamplesCollected = 0
   294  		initialSample = false
   295  	}
   296  
   297  	for {
   298  		select {
   299  		case <-ctx.Done():
   300  			return
   301  		case <-backoff.C:
   302  			backoffPeriod = min(backoffPeriod*2, m.maxSamplePeriod)
   303  			backoff.Reset(backoffPeriod)
   304  
   305  			currRU, err := m.ruf.ResourceUsageForPID(m.pid)
   306  			if err != nil {
   307  				m.errorf("failed to get resource usage for process[%d]: %v", m.pid, err)
   308  				resetSamples()
   309  				continue
   310  			}
   311  
   312  			if !m.enforceMemoryLimit(ctx, currRU.ResidentMemory) {
   313  				resetSamples()
   314  				continue
   315  			}
   316  
   317  			var ss int
   318  			if initialSample {
   319  				ss = m.initialSampleSize
   320  			} else {
   321  				ss = m.sampleSize
   322  			}
   323  
   324  			err = AggregateResourceUsage(prevRU, currRU, ss, &aggRU, false)
   325  			if err != nil {
   326  				m.errorf("aggregation error: %v", err)
   327  				resetSamples()
   328  				continue
   329  			}
   330  
   331  			prevRU = currRU
   332  			numSamplesCollected++
   333  
   334  			// Sample size reached.
   335  			if numSamplesCollected == ss {
   336  				debugStatus, err := m.ruf.DebugStatusForPID(m.pid)
   337  				if err != nil {
   338  					m.errorf("failed to get debug status for process[%d]: %v", m.pid, err)
   339  				}
   340  				rud := &mpb.ResourceUsageData{
   341  					Scope:            m.scope,
   342  					Pid:              int64(m.pid),
   343  					ProcessStartTime: m.processStartTime,
   344  					Version:          m.version,
   345  					DataTimestamp:    tspb.Now(),
   346  					ResourceUsage:    &aggRU,
   347  					DebugStatus:      debugStatus,
   348  				}
   349  				if err := SendProtoToServer(ctx, rud, "ResourceUsage", m.sc); err != nil {
   350  					m.errorf("failed to send resource-usage data to the server: %v", err)
   351  				}
   352  				resetSamples()
   353  			}
   354  		}
   355  	}
   356  }
   357  
   358  // enforceMemoryLimit kills the monitored process if the given memory usage exceeds the configured limit.
   359  // A boolean is returned, which is true if the memory usage is below the limit.
   360  func (m *ResourceUsageMonitor) enforceMemoryLimit(ctx context.Context, currResidentMemory int64) bool {
   361  	if m.memoryLimit <= 0 || currResidentMemory < m.memoryLimit {
   362  		return true
   363  	}
   364  	// m.scope is the service name here.
   365  	log.Warningf("Memory limit (%d bytes) exceeded for %s; pid %d, killing.", m.memoryLimit, m.scope, m.pid)
   366  
   367  	// Send notification to server before killing the process (which could be the Fleetspeak process).
   368  	kn := &mpb.KillNotification{
   369  		Service:          m.scope,
   370  		Pid:              int64(m.pid),
   371  		Version:          m.version,
   372  		ProcessStartTime: m.processStartTime,
   373  		KilledWhen:       tspb.Now(),
   374  		Reason:           mpb.KillNotification_MEMORY_EXCEEDED,
   375  	}
   376  	if err := SendProtoToServer(ctx, kn, "KillNotification", m.sc); err != nil {
   377  		log.Errorf("Failed to send kill notification to server: %v", err)
   378  	}
   379  
   380  	if err := process.KillByPid(m.pid); err != nil {
   381  		log.Errorf("Error while killing a process that exceeded its memory limit (%d bytes) - %s pid %d: %v", m.memoryLimit, m.scope, m.pid, err)
   382  	}
   383  	return false
   384  }
   385  
   386  func (m *ResourceUsageMonitor) errorf(format string, a ...any) {
   387  	err := fmt.Errorf(format, a...)
   388  	if m.errChan == nil {
   389  		log.Errorf("Resource-usage monitor encountered an error: %v", err)
   390  	} else {
   391  		m.errChan <- err
   392  	}
   393  }
   394  
   395  // SendProtoToServer wraps a proto in a fspb.Message and sends it to the server.
   396  func SendProtoToServer(ctx context.Context, pb proto.Message, msgType string, sc service.Context) error {
   397  	d, err := anypb.New(pb)
   398  	if err != nil {
   399  		return err
   400  	}
   401  	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
   402  	defer cancel()
   403  	return sc.Send(ctx, service.AckMessage{
   404  		M: &fspb.Message{
   405  			Destination: &fspb.Address{ServiceName: "system"},
   406  			MessageType: msgType,
   407  			Data:        d,
   408  			Priority:    fspb.Message_LOW,
   409  			Background:  true,
   410  		},
   411  	})
   412  }