github.com/livekit/protocol@v1.16.1-0.20240517185851-47e4c6bba773/utils/hwstats/cpu.go

// Copyright 2023 LiveKit, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hwstats

import (
	"time"

	"github.com/frostbyte73/core"
	"github.com/prometheus/procfs"
	"go.uber.org/atomic"

	"github.com/livekit/protocol/logger"
)

// This object returns cgroup-quota-aware CPU stats. On systems other than
// Linux, it falls back to full-system stats.

type platformCPUMonitor interface {
	getCPUIdle() (float64, error)
	numCPU() float64
}

type CPUStats struct {
	idleCPUs atomic.Float64
	platform platformCPUMonitor

	idleCallback    func(idle float64)
	procCallback    func(idle float64, usage map[int]float64)
	warningThrottle core.Throttle
	closeChan       chan struct{}
}

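// NewCPUStats starts a background monitor that samples idle CPU once per
// second and reports it through idleUpdateCallback. The idle value is
// expressed as a number of CPUs rather than a ratio; divide by NumCPU to
// obtain a ratio. Call Stop to end monitoring. A minimal usage sketch
// (illustrative, not part of this package) might look like:
//
//	stats, err := hwstats.NewCPUStats(func(idle float64) {
//		// idle is in CPUs; compare against stats.NumCPU() for a load ratio
//	})
//	if err != nil {
//		// handle error
//	}
//	defer stats.Stop()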
func NewCPUStats(idleUpdateCallback func(idle float64)) (*CPUStats, error) {
	p, err := newPlatformCPUMonitor()
	if err != nil {
		return nil, err
	}

	c := &CPUStats{
		platform:        p,
		warningThrottle: core.NewThrottle(time.Minute),
		idleCallback:    idleUpdateCallback,
		closeChan:       make(chan struct{}),
	}

	go c.monitorCPULoad()

	return c, nil
}

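// NewProcCPUStats starts a background monitor that, once per second, reports
// both the idle CPU count and a per-PID usage map through procUpdateCallback.
// Usage of descendant processes is rolled up into the direct children of the
// current process. It reads /proc via the procfs package, so it is only
// effective on Linux.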
func NewProcCPUStats(procUpdateCallback func(idle float64, usage map[int]float64)) (*CPUStats, error) {
	p, err := newPlatformCPUMonitor()
	if err != nil {
		return nil, err
	}

	c := &CPUStats{
		platform:        p,
		warningThrottle: core.NewThrottle(time.Minute),
		procCallback:    procUpdateCallback,
		closeChan:       make(chan struct{}),
	}

	go c.monitorProcCPULoad()

	return c, nil
}

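// GetCPUIdle returns the most recently sampled idle CPU value, expressed as a
// number of CPUs.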
func (c *CPUStats) GetCPUIdle() float64 {
	return c.idleCPUs.Load()
}

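// NumCPU returns the number of CPUs available to this process, accounting for
// cgroup quotas where supported.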
func (c *CPUStats) NumCPU() float64 {
	return c.platform.numCPU()
}

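// Stop terminates the monitoring goroutine.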
func (c *CPUStats) Stop() {
	close(c.closeChan)
}

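// monitorCPULoad samples idle CPU once per second, stores the result, logs a
// throttled warning when less than 10% of capacity is idle, and invokes the
// idle callback if one was provided.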
func (c *CPUStats) monitorCPULoad() {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-c.closeChan:
			return
		case <-ticker.C:
			idle, err := c.platform.getCPUIdle()
			if err != nil {
				logger.Errorw("failed retrieving CPU idle", err)
				continue
			}

			c.idleCPUs.Store(idle)
			idleRatio := idle / c.platform.numCPU()

			if idleRatio < 0.1 {
				c.warningThrottle(func() { logger.Infow("high cpu load", "load", 1-idleRatio) })
			}

			if c.idleCallback != nil {
				c.idleCallback(idle)
			}
		}
	}
}

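// monitorProcCPULoad samples /proc once per second, computes each process's
// CPU time delta since the previous sample, attributes usage of descendant
// processes to the direct children of this process, and reports the remaining
// idle CPU alongside the per-PID usage map.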
func (c *CPUStats) monitorProcCPULoad() {
	numCPU := c.platform.numCPU()

	fs, err := procfs.NewFS(procfs.DefaultMountPoint)
	if err != nil {
		logger.Errorw("failed to read proc fs", err)
		return
	}
	hostCPU, err := getHostCPUCount(fs)
	if err != nil {
		logger.Errorw("failed to read host cpu count", err)
		return
	}

	self, err := fs.Self()
	if err != nil {
		logger.Errorw("failed to read self", err)
		return
	}

	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	var prevTotalTime float64
	var prevStats map[int]procfs.ProcStat
	for {
		select {
		case <-c.closeChan:
			return
		case <-ticker.C:
			procStats := make(map[int]procfs.ProcStat)
			procs, err := procfs.AllProcs()
			if err != nil {
				logger.Errorw("failed to read processes", err)
				continue
			}

			total, err := fs.Stat()
			if err != nil {
				logger.Errorw("failed to read stats", err)
				continue
			}

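			// Snapshot every process's stat and remember its parent PID so that
			// usage can be rolled up to the direct children of this process below.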
			ppids := make(map[int]int)
			for _, proc := range procs {
				stat, err := proc.Stat()
				if err != nil {
					continue
				}

				procStats[proc.PID] = stat
				if proc.PID != self.PID {
					ppids[proc.PID] = stat.PPID
				}
			}

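			// Aggregate host CPU time across all states; the delta between samples
			// is the denominator for per-process usage below.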
			totalHostTime := total.CPUTotal.Idle + total.CPUTotal.Iowait +
				total.CPUTotal.User + total.CPUTotal.Nice + total.CPUTotal.System +
				total.CPUTotal.IRQ + total.CPUTotal.SoftIRQ + total.CPUTotal.Steal

			usage := make(map[int]float64)
			podUsage := 0.0
			for pid, stat := range procStats {
				// CPU time consumed by this process since the previous sample, in clock ticks
				procPercentUsage := float64(stat.UTime + stat.STime - prevStats[pid].UTime - prevStats[pid].STime)
				if procPercentUsage == 0 {
					continue
				}

				// bundle usage up to the first child of the main go process
				for ppids[pid] != self.PID && ppids[pid] != 0 {
					pid = ppids[pid]
				}

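				// Convert the tick delta to the same units as totalHostTime (the
				// division by 100 assumes the conventional USER_HZ of 100) and scale
				// by the host CPU count so procUsage is expressed in CPUs.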
				procUsage := hostCPU * procPercentUsage / 100 / (totalHostTime - prevTotalTime)
				usage[pid] += procUsage
				podUsage += procUsage
			}

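			// Idle is the portion of this instance's CPU allocation (cgroup-aware)
			// not consumed by the pod's processes, in CPUs.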
			idle := numCPU - podUsage
			c.idleCPUs.Store(idle)

			if c.procCallback != nil {
				c.procCallback(idle, usage)
			}

			prevTotalTime = totalHostTime
			prevStats = procStats
		}
	}
}