github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/client/internal/monitoring/resource_usage_monitor.go (about) 1 // Copyright 2017 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package monitoring 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "math" 22 "time" 23 24 anypb "google.golang.org/protobuf/types/known/anypb" 25 26 log "github.com/golang/glog" 27 "google.golang.org/protobuf/proto" 28 tspb "google.golang.org/protobuf/types/known/timestamppb" 29 30 "github.com/google/fleetspeak/fleetspeak/src/client/internal/process" 31 "github.com/google/fleetspeak/fleetspeak/src/client/service" 32 33 fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak" 34 mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring" 35 ) 36 37 const ( 38 epsilon float64 = 1e-4 39 defaultSampleSize = 20 40 defaultSamplePeriod = 30 * time.Second 41 ) 42 43 // AggregateResourceUsage is a helper function for aggregating resource-usage data across multiple 44 // resource-usage queries. It should be called once, in sequence, for each ResourceUsage result. 45 // 46 // 'numRUCalls' is the number of resource-usage samples aggregated into one AggregatedResourceUsage 47 // proto; it is used to compute mean metrics. 48 // 'aggRU' is only updated if no error is encountered. 49 // 50 // We don't get memory usage data from finished commands. The commandFinished 51 // bool argument makes this function skip memory usage aggregation. 52 func AggregateResourceUsage(prevRU *ResourceUsage, currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage, commandFinished bool) error { 53 if numRUCalls < 2 { 54 return errors.New("number of resource-usage calls should be at least 2 (for rate computation)") 55 } 56 if aggRU == nil { 57 return errors.New("aggregated resource-usage proto should not be nil") 58 } 59 60 if prevRU == nil { 61 if !proto.Equal(aggRU, &mpb.AggregatedResourceUsage{}) { 62 return fmt.Errorf( 63 "previous resource-usage is nil, but aggregated proto already has fields set: %v", aggRU) 64 } 65 aggRU.MeanResidentMemory = float64(currRU.ResidentMemory) / float64(numRUCalls) 66 aggRU.MaxResidentMemory = currRU.ResidentMemory 67 aggRU.MeanNumFds = float64(currRU.NumFDs) / float64(numRUCalls) 68 aggRU.MaxNumFds = currRU.NumFDs 69 return nil 70 } 71 72 if !currRU.Timestamp.After(prevRU.Timestamp) { 73 return fmt.Errorf( 74 "timestamp for current resource-usage[%v] should be > that of previous resource-usage[%v]", 75 currRU.Timestamp, prevRU.Timestamp) 76 } 77 78 if err := aggregateTimeResourceUsage(prevRU, currRU, numRUCalls, aggRU); err != nil { 79 return err 80 } 81 82 if commandFinished { 83 return nil 84 } 85 86 aggregateMemoryResourceUsage(currRU, numRUCalls, aggRU) 87 aggregateNumFDsResourceUsage(currRU, numRUCalls, aggRU) 88 89 return nil 90 } 91 92 func aggregateTimeResourceUsage(prevRU *ResourceUsage, currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) error { 93 if currRU.UserCPUMillis+epsilon < prevRU.UserCPUMillis { 94 return fmt.Errorf( 95 "cumulative user-mode CPU-usage is not expected to decrease: [%v -> %v]", 96 prevRU.UserCPUMillis, currRU.UserCPUMillis) 97 } 98 99 if currRU.SystemCPUMillis+epsilon < prevRU.SystemCPUMillis { 100 return fmt.Errorf( 101 "cumulative system-mode CPU-usage is not expected to decrease: [%v -> %v]", 102 prevRU.SystemCPUMillis, currRU.SystemCPUMillis) 103 } 104 105 elapsedSecs := currRU.Timestamp.Sub(prevRU.Timestamp).Seconds() 106 userCPURate := (currRU.UserCPUMillis - prevRU.UserCPUMillis) / elapsedSecs 107 systemCPURate := (currRU.SystemCPUMillis - prevRU.SystemCPUMillis) / elapsedSecs 108 109 // Note that since rates are computed between two consecutive data-points, their 110 // average uses a sample size of n - 1, where n is the number of resource-usage queries. 111 aggRU.MeanUserCpuRate += userCPURate / float64(numRUCalls-1) 112 aggRU.MaxUserCpuRate = math.Max(userCPURate, aggRU.MaxUserCpuRate) 113 aggRU.MeanSystemCpuRate += systemCPURate / float64(numRUCalls-1) 114 aggRU.MaxSystemCpuRate = math.Max(systemCPURate, aggRU.MaxSystemCpuRate) 115 116 return nil 117 } 118 119 func aggregateMemoryResourceUsage(currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) { 120 // Note that since rates are computed between two consecutive data-points, their 121 // average uses a sample size of n - 1, where n is the number of resource-usage queries. 122 aggRU.MeanResidentMemory += float64(currRU.ResidentMemory) / float64(numRUCalls) 123 if currRU.ResidentMemory > aggRU.MaxResidentMemory { 124 aggRU.MaxResidentMemory = currRU.ResidentMemory 125 } 126 } 127 128 func aggregateNumFDsResourceUsage(currRU *ResourceUsage, numRUCalls int, aggRU *mpb.AggregatedResourceUsage) { 129 aggRU.MeanNumFds += float64(currRU.NumFDs) / float64(numRUCalls) 130 if currRU.NumFDs > aggRU.MaxNumFds { 131 aggRU.MaxNumFds = currRU.NumFDs 132 } 133 } 134 135 // AggregateResourceUsageForFinishedCmd computes resource-usage for a finished process, given 136 // resource-usage before and after the process ran. 137 func AggregateResourceUsageForFinishedCmd(initialRU, finalRU *ResourceUsage) (*mpb.AggregatedResourceUsage, error) { 138 aggRU := mpb.AggregatedResourceUsage{} 139 err := AggregateResourceUsage(nil, initialRU, 2, &aggRU, true) 140 if err != nil { 141 return nil, err 142 } 143 err = AggregateResourceUsage(initialRU, finalRU, 2, &aggRU, true) 144 if err != nil { 145 return nil, err 146 } 147 148 // If this field is untouched, we have not aggregated memory resource usage 149 // for this process yet. We fill it in with what we have. 150 // TODO 151 if aggRU.MaxResidentMemory == 0 { 152 aggRU.MeanResidentMemory = float64(initialRU.ResidentMemory) 153 aggRU.MaxResidentMemory = initialRU.ResidentMemory 154 } 155 156 return &aggRU, nil 157 } 158 159 // Interface for ResourceUsageFetcher, to facilitate stubbing out of the real fetcher in tests. 160 type resourceUsageFetcherI interface { 161 ResourceUsageForPID(pid int) (*ResourceUsage, error) 162 DebugStatusForPID(pid int) (string, error) 163 } 164 165 // ResourceUsageMonitor computes resource-usage metrics for a process and delivers them periodically 166 // via a channel. 167 type ResourceUsageMonitor struct { 168 sc service.Context 169 170 scope string 171 pid int 172 memoryLimit int64 173 version string 174 processStartTime *tspb.Timestamp 175 maxSamplePeriod time.Duration 176 initialSampleSize int 177 sampleSize int 178 179 ruf resourceUsageFetcherI 180 errChan chan<- error 181 doneChan <-chan struct{} 182 } 183 184 // ResourceUsageMonitorParams contains parameters that might be set when 185 // creating a ResourceUsageMonitor. 186 type ResourceUsageMonitorParams struct { 187 // What we are monitoring. Typically a service name, or 'system' for the 188 // Fleetspeak client itself. 189 Scope string 190 191 // The version string of the service that we are monitoring, if known. 192 Version string 193 194 // The process id that we are monitoring. 195 Pid int 196 197 // If nonzero, the monitored process should be killed if it exceeds this 198 // memory limit, in bytes. 199 MemoryLimit int64 200 201 // The time that the processes was started (if known). 202 ProcessStartTime time.Time 203 204 // The longest time to wait between samples. 205 MaxSamplePeriod time.Duration 206 207 // The number of resource-usage query results that get aggregated into 208 // a single resource-usage report sent to Fleetspeak servers. 209 SampleSize int 210 211 // If set, the resource monitor will report errors on this channel. If unset, 212 // errors will be logged. 213 Err chan<- error 214 215 // If set, stubs out the actual resource fetching. Meant for use only in unit tests. 216 ruf resourceUsageFetcherI 217 } 218 219 // New returns a new ResourceUsageMonitor. 220 // Once created, it must be started with Run(). 221 func New(sc service.Context, params ResourceUsageMonitorParams) (*ResourceUsageMonitor, error) { 222 var startTimeProto *tspb.Timestamp 223 224 if !params.ProcessStartTime.IsZero() { 225 startTimeProto = tspb.New(params.ProcessStartTime) 226 if err := startTimeProto.CheckValid(); err != nil { 227 return nil, fmt.Errorf("process start time is invalid: %v", err) 228 } 229 } 230 231 if params.SampleSize == 0 { 232 params.SampleSize = defaultSampleSize 233 } 234 if params.MaxSamplePeriod == 0 { 235 params.MaxSamplePeriod = defaultSamplePeriod 236 } 237 if params.SampleSize < 2 { 238 return nil, fmt.Errorf("sample size %d invalid - must be at least 2 (for rate computation)", params.SampleSize) 239 } 240 241 maxSamplePeriodSecs := int(params.MaxSamplePeriod / time.Second) 242 var backoffSize int 243 if maxSamplePeriodSecs == 0 { 244 backoffSize = 0 245 } else { 246 backoffSize = int(math.Log2(float64(maxSamplePeriodSecs))) 247 } 248 // First sample is bigger because of the backoff. 249 initialSampleSize := params.SampleSize + backoffSize 250 251 if params.ruf == nil { 252 params.ruf = ResourceUsageFetcher{} 253 } 254 255 m := ResourceUsageMonitor{ 256 sc: sc, 257 258 scope: params.Scope, 259 pid: params.Pid, 260 memoryLimit: params.MemoryLimit, 261 version: params.Version, 262 processStartTime: startTimeProto, 263 maxSamplePeriod: params.MaxSamplePeriod, 264 initialSampleSize: initialSampleSize, 265 sampleSize: params.SampleSize, 266 267 ruf: params.ruf, 268 errChan: params.Err, 269 } 270 271 return &m, nil 272 } 273 274 // Run is the business method of the resource-usage monitor. 275 // It blocks until ctx is canceled. 276 func (m *ResourceUsageMonitor) Run(ctx context.Context) { 277 ctx, cancel := context.WithCancel(ctx) 278 defer cancel() 279 280 // 1s, 2s, 4s, 8s, 16s, ..., m.maxSamplePeriod, m.maxSamplePeriod, m.maxSamplePeriod, ... 281 backoffPeriod := min(time.Second, m.maxSamplePeriod) 282 backoff := time.NewTimer(backoffPeriod) 283 defer backoff.Stop() 284 initialSample := true 285 286 var prevRU *ResourceUsage 287 aggRU := mpb.AggregatedResourceUsage{} 288 numSamplesCollected := 0 289 290 resetSamples := func() { 291 prevRU = nil 292 aggRU = mpb.AggregatedResourceUsage{} 293 numSamplesCollected = 0 294 initialSample = false 295 } 296 297 for { 298 select { 299 case <-ctx.Done(): 300 return 301 case <-backoff.C: 302 backoffPeriod = min(backoffPeriod*2, m.maxSamplePeriod) 303 backoff.Reset(backoffPeriod) 304 305 currRU, err := m.ruf.ResourceUsageForPID(m.pid) 306 if err != nil { 307 m.errorf("failed to get resource usage for process[%d]: %v", m.pid, err) 308 resetSamples() 309 continue 310 } 311 312 if !m.enforceMemoryLimit(ctx, currRU.ResidentMemory) { 313 resetSamples() 314 continue 315 } 316 317 var ss int 318 if initialSample { 319 ss = m.initialSampleSize 320 } else { 321 ss = m.sampleSize 322 } 323 324 err = AggregateResourceUsage(prevRU, currRU, ss, &aggRU, false) 325 if err != nil { 326 m.errorf("aggregation error: %v", err) 327 resetSamples() 328 continue 329 } 330 331 prevRU = currRU 332 numSamplesCollected++ 333 334 // Sample size reached. 335 if numSamplesCollected == ss { 336 debugStatus, err := m.ruf.DebugStatusForPID(m.pid) 337 if err != nil { 338 m.errorf("failed to get debug status for process[%d]: %v", m.pid, err) 339 } 340 rud := &mpb.ResourceUsageData{ 341 Scope: m.scope, 342 Pid: int64(m.pid), 343 ProcessStartTime: m.processStartTime, 344 Version: m.version, 345 DataTimestamp: tspb.Now(), 346 ResourceUsage: &aggRU, 347 DebugStatus: debugStatus, 348 } 349 if err := SendProtoToServer(ctx, rud, "ResourceUsage", m.sc); err != nil { 350 m.errorf("failed to send resource-usage data to the server: %v", err) 351 } 352 resetSamples() 353 } 354 } 355 } 356 } 357 358 // enforceMemoryLimit kills the monitored process if the given memory usage exceeds the configured limit. 359 // A boolean is returned, which is true if the memory usage is below the limit. 360 func (m *ResourceUsageMonitor) enforceMemoryLimit(ctx context.Context, currResidentMemory int64) bool { 361 if m.memoryLimit <= 0 || currResidentMemory < m.memoryLimit { 362 return true 363 } 364 // m.scope is the service name here. 365 log.Warningf("Memory limit (%d bytes) exceeded for %s; pid %d, killing.", m.memoryLimit, m.scope, m.pid) 366 367 // Send notification to server before killing the process (which could be the Fleetspeak process). 368 kn := &mpb.KillNotification{ 369 Service: m.scope, 370 Pid: int64(m.pid), 371 Version: m.version, 372 ProcessStartTime: m.processStartTime, 373 KilledWhen: tspb.Now(), 374 Reason: mpb.KillNotification_MEMORY_EXCEEDED, 375 } 376 if err := SendProtoToServer(ctx, kn, "KillNotification", m.sc); err != nil { 377 log.Errorf("Failed to send kill notification to server: %v", err) 378 } 379 380 if err := process.KillByPid(m.pid); err != nil { 381 log.Errorf("Error while killing a process that exceeded its memory limit (%d bytes) - %s pid %d: %v", m.memoryLimit, m.scope, m.pid, err) 382 } 383 return false 384 } 385 386 func (m *ResourceUsageMonitor) errorf(format string, a ...any) { 387 err := fmt.Errorf(format, a...) 388 if m.errChan == nil { 389 log.Errorf("Resource-usage monitor encountered an error: %v", err) 390 } else { 391 m.errChan <- err 392 } 393 } 394 395 // SendProtoToServer wraps a proto in a fspb.Message and sends it to the server. 396 func SendProtoToServer(ctx context.Context, pb proto.Message, msgType string, sc service.Context) error { 397 d, err := anypb.New(pb) 398 if err != nil { 399 return err 400 } 401 ctx, cancel := context.WithTimeout(ctx, 30*time.Second) 402 defer cancel() 403 return sc.Send(ctx, service.AckMessage{ 404 M: &fspb.Message{ 405 Destination: &fspb.Address{ServiceName: "system"}, 406 MessageType: msgType, 407 Data: d, 408 Priority: fspb.Message_LOW, 409 Background: true, 410 }, 411 }) 412 }