github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/drivers/shared/executor/pid_collector.go (about) 1 package executor 2 3 import ( 4 "os" 5 "strconv" 6 "sync" 7 "time" 8 9 hclog "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/nomad/client/stats" 11 "github.com/hashicorp/nomad/plugins/drivers" 12 ps "github.com/mitchellh/go-ps" 13 "github.com/shirou/gopsutil/process" 14 ) 15 16 var ( 17 // pidScanInterval is the interval at which the executor scans the process 18 // tree for finding out the pids that the executor and it's child processes 19 // have forked 20 pidScanInterval = 5 * time.Second 21 ) 22 23 // pidCollector is a utility that can be embedded in an executor to collect pid 24 // stats 25 type pidCollector struct { 26 pids map[int]*nomadPid 27 pidLock sync.RWMutex 28 logger hclog.Logger 29 } 30 31 // nomadPid holds a pid and it's cpu percentage calculator 32 type nomadPid struct { 33 pid int 34 cpuStatsTotal *stats.CpuStats 35 cpuStatsUser *stats.CpuStats 36 cpuStatsSys *stats.CpuStats 37 } 38 39 // allPidGetter is a func which is used by the pid collector to gather 40 // stats on 41 type allPidGetter func() (map[int]*nomadPid, error) 42 43 func newPidCollector(logger hclog.Logger) *pidCollector { 44 return &pidCollector{ 45 pids: make(map[int]*nomadPid), 46 logger: logger.Named("pid_collector"), 47 } 48 } 49 50 // collectPids collects the pids of the child processes that the executor is 51 // running every 5 seconds 52 func (c *pidCollector) collectPids(stopCh chan interface{}, pidGetter allPidGetter) { 53 // Fire the timer right away when the executor starts from there on the pids 54 // are collected every scan interval 55 timer := time.NewTimer(0) 56 defer timer.Stop() 57 for { 58 select { 59 case <-timer.C: 60 pids, err := pidGetter() 61 if err != nil { 62 c.logger.Debug("error collecting pids", "error", err) 63 } 64 c.pidLock.Lock() 65 66 // Adding pids which are not being tracked 67 for pid, np := range pids { 68 if _, ok := c.pids[pid]; !ok { 69 c.pids[pid] = np 70 } 71 } 72 // Removing pids which are no longer present 73 for pid := range c.pids { 74 if _, ok := pids[pid]; !ok { 75 delete(c.pids, pid) 76 } 77 } 78 c.pidLock.Unlock() 79 timer.Reset(pidScanInterval) 80 case <-stopCh: 81 return 82 } 83 } 84 } 85 86 // scanPids scans all the pids on the machine running the current executor and 87 // returns the child processes of the executor. 88 func scanPids(parentPid int, allPids []ps.Process) (map[int]*nomadPid, error) { 89 processFamily := make(map[int]struct{}) 90 processFamily[parentPid] = struct{}{} 91 92 // A mapping of pids to their parent pids. It is used to build the process 93 // tree of the executing task 94 pidsRemaining := make(map[int]int, len(allPids)) 95 for _, pid := range allPids { 96 pidsRemaining[pid.Pid()] = pid.PPid() 97 } 98 99 for { 100 // flag to indicate if we have found a match 101 foundNewPid := false 102 103 for pid, ppid := range pidsRemaining { 104 _, childPid := processFamily[ppid] 105 106 // checking if the pid is a child of any of the parents 107 if childPid { 108 processFamily[pid] = struct{}{} 109 delete(pidsRemaining, pid) 110 foundNewPid = true 111 } 112 } 113 114 // not scanning anymore if we couldn't find a single match 115 if !foundNewPid { 116 break 117 } 118 } 119 120 res := make(map[int]*nomadPid) 121 for pid := range processFamily { 122 np := nomadPid{ 123 pid: pid, 124 cpuStatsTotal: stats.NewCpuStats(), 125 cpuStatsUser: stats.NewCpuStats(), 126 cpuStatsSys: stats.NewCpuStats(), 127 } 128 res[pid] = &np 129 } 130 return res, nil 131 } 132 133 // pidStats returns the resource usage stats per pid 134 func (c *pidCollector) pidStats() (map[string]*drivers.ResourceUsage, error) { 135 stats := make(map[string]*drivers.ResourceUsage) 136 c.pidLock.RLock() 137 pids := make(map[int]*nomadPid, len(c.pids)) 138 for k, v := range c.pids { 139 pids[k] = v 140 } 141 c.pidLock.RUnlock() 142 for pid, np := range pids { 143 p, err := process.NewProcess(int32(pid)) 144 if err != nil { 145 c.logger.Trace("unable to create new process", "pid", pid, "error", err) 146 continue 147 } 148 ms := &drivers.MemoryStats{} 149 if memInfo, err := p.MemoryInfo(); err == nil { 150 ms.RSS = memInfo.RSS 151 ms.Swap = memInfo.Swap 152 ms.Measured = ExecutorBasicMeasuredMemStats 153 } 154 155 cs := &drivers.CpuStats{} 156 if cpuStats, err := p.Times(); err == nil { 157 cs.SystemMode = np.cpuStatsSys.Percent(cpuStats.System * float64(time.Second)) 158 cs.UserMode = np.cpuStatsUser.Percent(cpuStats.User * float64(time.Second)) 159 cs.Measured = ExecutorBasicMeasuredCpuStats 160 161 // calculate cpu usage percent 162 cs.Percent = np.cpuStatsTotal.Percent(cpuStats.Total() * float64(time.Second)) 163 } 164 stats[strconv.Itoa(pid)] = &drivers.ResourceUsage{MemoryStats: ms, CpuStats: cs} 165 } 166 167 return stats, nil 168 } 169 170 // aggregatedResourceUsage aggregates the resource usage of all the pids and 171 // returns a TaskResourceUsage data point 172 func aggregatedResourceUsage(systemCpuStats *stats.CpuStats, pidStats map[string]*drivers.ResourceUsage) *drivers.TaskResourceUsage { 173 ts := time.Now().UTC().UnixNano() 174 var ( 175 systemModeCPU, userModeCPU, percent float64 176 totalRSS, totalSwap uint64 177 ) 178 179 for _, pidStat := range pidStats { 180 systemModeCPU += pidStat.CpuStats.SystemMode 181 userModeCPU += pidStat.CpuStats.UserMode 182 percent += pidStat.CpuStats.Percent 183 184 totalRSS += pidStat.MemoryStats.RSS 185 totalSwap += pidStat.MemoryStats.Swap 186 } 187 188 totalCPU := &drivers.CpuStats{ 189 SystemMode: systemModeCPU, 190 UserMode: userModeCPU, 191 Percent: percent, 192 Measured: ExecutorBasicMeasuredCpuStats, 193 TotalTicks: systemCpuStats.TicksConsumed(percent), 194 } 195 196 totalMemory := &drivers.MemoryStats{ 197 RSS: totalRSS, 198 Swap: totalSwap, 199 Measured: ExecutorBasicMeasuredMemStats, 200 } 201 202 resourceUsage := drivers.ResourceUsage{ 203 MemoryStats: totalMemory, 204 CpuStats: totalCPU, 205 } 206 return &drivers.TaskResourceUsage{ 207 ResourceUsage: &resourceUsage, 208 Timestamp: ts, 209 Pids: pidStats, 210 } 211 } 212 213 func getAllPidsByScanning() (map[int]*nomadPid, error) { 214 allProcesses, err := ps.Processes() 215 if err != nil { 216 return nil, err 217 } 218 return scanPids(os.Getpid(), allProcesses) 219 }