github.com/bigcommerce/nomad@v0.9.3-bc/client/stats/host.go (about)

     1  package stats
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"runtime"
     7  	"sync"
     8  	"time"
     9  
    10  	hclog "github.com/hashicorp/go-hclog"
    11  	"github.com/hashicorp/nomad/plugins/device"
    12  	"github.com/shirou/gopsutil/cpu"
    13  	"github.com/shirou/gopsutil/disk"
    14  	"github.com/shirou/gopsutil/host"
    15  	"github.com/shirou/gopsutil/mem"
    16  )
    17  
    18  // HostStats represents resource usage stats of the host running a Nomad client
    19  type HostStats struct {
    20  	Memory           *MemoryStats
    21  	CPU              []*CPUStats
    22  	DiskStats        []*DiskStats
    23  	AllocDirStats    *DiskStats
    24  	DeviceStats      []*DeviceGroupStats
    25  	Uptime           uint64
    26  	Timestamp        int64
    27  	CPUTicksConsumed float64
    28  }
    29  
    30  // MemoryStats represents stats related to virtual memory usage
    31  type MemoryStats struct {
    32  	Total     uint64
    33  	Available uint64
    34  	Used      uint64
    35  	Free      uint64
    36  }
    37  
    38  // CPUStats represents stats related to cpu usage
    39  type CPUStats struct {
    40  	CPU    string
    41  	User   float64
    42  	System float64
    43  	Idle   float64
    44  	Total  float64
    45  }
    46  
    47  // DiskStats represents stats related to disk usage
    48  type DiskStats struct {
    49  	Device            string
    50  	Mountpoint        string
    51  	Size              uint64
    52  	Used              uint64
    53  	Available         uint64
    54  	UsedPercent       float64
    55  	InodesUsedPercent float64
    56  }
    57  
    58  // DeviceGroupStats represents stats related to device group
    59  type DeviceGroupStats = device.DeviceGroupStats
    60  
    61  // DeviceStatsCollector is used to retrieve all the latest statistics for all devices.
    62  type DeviceStatsCollector func() []*DeviceGroupStats
    63  
    64  // NodeStatsCollector is an interface which is used for the purposes of mocking
    65  // the HostStatsCollector in the tests
    66  type NodeStatsCollector interface {
    67  	Collect() error
    68  	Stats() *HostStats
    69  }
    70  
    71  // HostStatsCollector collects host resource usage stats
    72  type HostStatsCollector struct {
    73  	numCores             int
    74  	statsCalculator      map[string]*HostCpuStatsCalculator
    75  	hostStats            *HostStats
    76  	hostStatsLock        sync.RWMutex
    77  	allocDir             string
    78  	deviceStatsCollector DeviceStatsCollector
    79  
    80  	// badParts is a set of partitions whose usage cannot be read; used to
    81  	// squelch logspam.
    82  	badParts map[string]struct{}
    83  
    84  	logger hclog.Logger
    85  }
    86  
    87  // NewHostStatsCollector returns a HostStatsCollector. The allocDir is passed in
    88  // so that we can present the disk related statistics for the mountpoint where
    89  // the allocation directory lives
    90  func NewHostStatsCollector(logger hclog.Logger, allocDir string, deviceStatsCollector DeviceStatsCollector) *HostStatsCollector {
    91  	logger = logger.Named("host_stats")
    92  	numCores := runtime.NumCPU()
    93  	statsCalculator := make(map[string]*HostCpuStatsCalculator)
    94  	collector := &HostStatsCollector{
    95  		statsCalculator:      statsCalculator,
    96  		numCores:             numCores,
    97  		logger:               logger,
    98  		allocDir:             allocDir,
    99  		badParts:             make(map[string]struct{}),
   100  		deviceStatsCollector: deviceStatsCollector,
   101  	}
   102  	return collector
   103  }
   104  
   105  // Collect collects stats related to resource usage of a host
   106  func (h *HostStatsCollector) Collect() error {
   107  	h.hostStatsLock.Lock()
   108  	defer h.hostStatsLock.Unlock()
   109  	return h.collectLocked()
   110  }
   111  
   112  // collectLocked collects stats related to resource usage of the host but should
   113  // be called with the lock held.
   114  func (h *HostStatsCollector) collectLocked() error {
   115  	hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
   116  
   117  	// Determine up-time
   118  	uptime, err := host.Uptime()
   119  	if err != nil {
   120  		return err
   121  	}
   122  	hs.Uptime = uptime
   123  
   124  	// Collect memory stats
   125  	mstats, err := h.collectMemoryStats()
   126  	if err != nil {
   127  		return err
   128  	}
   129  	hs.Memory = mstats
   130  
   131  	// Collect cpu stats
   132  	cpus, ticks, err := h.collectCPUStats()
   133  	if err != nil {
   134  		return err
   135  	}
   136  	hs.CPU = cpus
   137  	hs.CPUTicksConsumed = ticks
   138  
   139  	// Collect disk stats
   140  	diskStats, err := h.collectDiskStats()
   141  	if err != nil {
   142  		return err
   143  	}
   144  	hs.DiskStats = diskStats
   145  
   146  	// Getting the disk stats for the allocation directory
   147  	usage, err := disk.Usage(h.allocDir)
   148  	if err != nil {
   149  		return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err)
   150  	}
   151  	hs.AllocDirStats = h.toDiskStats(usage, nil)
   152  
   153  	// Collect devices stats
   154  	deviceStats := h.collectDeviceGroupStats()
   155  	hs.DeviceStats = deviceStats
   156  
   157  	// Update the collected status object.
   158  	h.hostStats = hs
   159  
   160  	return nil
   161  }
   162  
   163  func (h *HostStatsCollector) collectMemoryStats() (*MemoryStats, error) {
   164  	memStats, err := mem.VirtualMemory()
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  	mem := &MemoryStats{
   169  		Total:     memStats.Total,
   170  		Available: memStats.Available,
   171  		Used:      memStats.Used,
   172  		Free:      memStats.Free,
   173  	}
   174  
   175  	return mem, nil
   176  }
   177  
   178  func (h *HostStatsCollector) collectDiskStats() ([]*DiskStats, error) {
   179  	partitions, err := disk.Partitions(false)
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  
   184  	var diskStats []*DiskStats
   185  	for _, partition := range partitions {
   186  		usage, err := disk.Usage(partition.Mountpoint)
   187  		if err != nil {
   188  			if _, ok := h.badParts[partition.Mountpoint]; ok {
   189  				// already known bad, don't log again
   190  				continue
   191  			}
   192  
   193  			h.badParts[partition.Mountpoint] = struct{}{}
   194  			h.logger.Warn("error fetching host disk usage stats", "error", err, "partition", partition.Mountpoint)
   195  			continue
   196  		}
   197  		delete(h.badParts, partition.Mountpoint)
   198  
   199  		ds := h.toDiskStats(usage, &partition)
   200  		diskStats = append(diskStats, ds)
   201  	}
   202  
   203  	return diskStats, nil
   204  }
   205  
   206  func (h *HostStatsCollector) collectDeviceGroupStats() []*DeviceGroupStats {
   207  	if h.deviceStatsCollector == nil {
   208  		return []*DeviceGroupStats{}
   209  	}
   210  
   211  	return h.deviceStatsCollector()
   212  }
   213  
   214  // Stats returns the host stats that has been collected
   215  func (h *HostStatsCollector) Stats() *HostStats {
   216  	h.hostStatsLock.RLock()
   217  	defer h.hostStatsLock.RUnlock()
   218  
   219  	if h.hostStats == nil {
   220  		if err := h.collectLocked(); err != nil {
   221  			h.logger.Warn("error fetching host resource usage stats", "error", err)
   222  		}
   223  	}
   224  
   225  	return h.hostStats
   226  }
   227  
   228  // toDiskStats merges UsageStat and PartitionStat to create a DiskStat
   229  func (h *HostStatsCollector) toDiskStats(usage *disk.UsageStat, partitionStat *disk.PartitionStat) *DiskStats {
   230  	ds := DiskStats{
   231  		Size:              usage.Total,
   232  		Used:              usage.Used,
   233  		Available:         usage.Free,
   234  		UsedPercent:       usage.UsedPercent,
   235  		InodesUsedPercent: usage.InodesUsedPercent,
   236  	}
   237  	if math.IsNaN(ds.UsedPercent) {
   238  		ds.UsedPercent = 0.0
   239  	}
   240  	if math.IsNaN(ds.InodesUsedPercent) {
   241  		ds.InodesUsedPercent = 0.0
   242  	}
   243  
   244  	if partitionStat != nil {
   245  		ds.Device = partitionStat.Device
   246  		ds.Mountpoint = partitionStat.Mountpoint
   247  	}
   248  
   249  	return &ds
   250  }
   251  
   252  // HostCpuStatsCalculator calculates cpu usage percentages
   253  type HostCpuStatsCalculator struct {
   254  	prevIdle   float64
   255  	prevUser   float64
   256  	prevSystem float64
   257  	prevBusy   float64
   258  	prevTotal  float64
   259  }
   260  
   261  // NewHostCpuStatsCalculator returns a HostCpuStatsCalculator
   262  func NewHostCpuStatsCalculator() *HostCpuStatsCalculator {
   263  	return &HostCpuStatsCalculator{}
   264  }
   265  
   266  // Calculate calculates the current cpu usage percentages
   267  func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) {
   268  	currentIdle := times.Idle
   269  	currentUser := times.User
   270  	currentSystem := times.System
   271  	currentTotal := times.Total()
   272  	currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
   273  		times.Softirq + times.Steal + times.Guest + times.GuestNice + times.Stolen
   274  
   275  	deltaTotal := currentTotal - h.prevTotal
   276  	idle = ((currentIdle - h.prevIdle) / deltaTotal) * 100
   277  	user = ((currentUser - h.prevUser) / deltaTotal) * 100
   278  	system = ((currentSystem - h.prevSystem) / deltaTotal) * 100
   279  	total = ((currentBusy - h.prevBusy) / deltaTotal) * 100
   280  
   281  	// Protect against any invalid values
   282  	if math.IsNaN(idle) || math.IsInf(idle, 0) {
   283  		idle = 100.0
   284  	}
   285  	if math.IsNaN(user) || math.IsInf(user, 0) {
   286  		user = 0.0
   287  	}
   288  	if math.IsNaN(system) || math.IsInf(system, 0) {
   289  		system = 0.0
   290  	}
   291  	if math.IsNaN(total) || math.IsInf(total, 0) {
   292  		total = 0.0
   293  	}
   294  
   295  	h.prevIdle = currentIdle
   296  	h.prevUser = currentUser
   297  	h.prevSystem = currentSystem
   298  	h.prevTotal = currentTotal
   299  	h.prevBusy = currentBusy
   300  	return
   301  }