bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/processes_linux.go (about) 1 package collectors 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "os" 7 "regexp" 8 "sort" 9 "strconv" 10 "strings" 11 "time" 12 13 "bosun.org/cmd/scollector/conf" 14 "bosun.org/metadata" 15 "bosun.org/opentsdb" 16 ) 17 18 func AddProcessConfig(params conf.ProcessParams) error { 19 p, err := NewWatchedProc(params) 20 if err != nil { 21 return err 22 } 23 watchedProcs = append(watchedProcs, p) 24 return nil 25 } 26 27 var watchedProcs = []*WatchedProc{} 28 29 var osPageSize = os.Getpagesize() 30 31 // linuxCoreCount counts the number of logical cpus since that is how cpu ticks 32 // are tracked 33 func linuxCoreCount() (c int64, err error) { 34 if err = readLine("/proc/cpuinfo", func(s string) (err error) { 35 f := strings.Fields(s) 36 if len(f) > 2 && f[0] == "processor" { 37 c++ 38 return 39 } 40 return 41 }); err != nil { 42 return c, fmt.Errorf("failed to read /proc/cpuinfo to get cpu core count: %v", err) 43 } 44 if c == 0 { 45 return c, fmt.Errorf("got a core count of 0, expected at least one core") 46 } 47 return 48 } 49 50 func WatchProcesses() { 51 if len(watchedProcs) == 0 { 52 return 53 } 54 collectors = append(collectors, &IntervalCollector{ 55 F: func() (opentsdb.MultiDataPoint, error) { 56 return c_linux_processes(watchedProcs) 57 }, 58 name: "c_linux_processes", 59 }) 60 } 61 62 func linuxProcMonitor(w *WatchedProc, md *opentsdb.MultiDataPoint) error { 63 var err error 64 var processCount int 65 var totalCPU int64 66 var totalVirtualMem int64 67 var totalRSSMem int64 68 for proc, id := range w.Processes { 69 pid := proc.Pid 70 file_status, e := os.Stat("/proc/" + pid) 71 if e != nil { 72 w.Remove(proc) 73 continue 74 } 75 processCount++ 76 stats_file, e := ioutil.ReadFile("/proc/" + pid + "/stat") 77 if e != nil { 78 w.Remove(proc) 79 continue 80 } 81 io_file, e := ioutil.ReadFile("/proc/" + pid + "/io") 82 if e != nil { 83 w.Remove(proc) 84 continue 85 } 86 
limits, e := ioutil.ReadFile("/proc/" + pid + "/limits") 87 if e != nil { 88 w.Remove(proc) 89 continue 90 } 91 fd_dir, e := os.Open("/proc/" + pid + "/fd") 92 if e != nil { 93 w.Remove(proc) 94 continue 95 } 96 fds, e := fd_dir.Readdirnames(0) 97 fd_dir.Close() 98 if e != nil { 99 w.Remove(proc) 100 continue 101 } 102 stats := strings.Fields(string(stats_file)) 103 if len(stats) < 24 { 104 err = fmt.Errorf("stats too short") 105 continue 106 } 107 var io []string 108 for _, line := range strings.Split(string(io_file), "\n") { 109 f := strings.Fields(line) 110 if len(f) == 2 { 111 io = append(io, f[1]) 112 } 113 } 114 if len(io) < 6 { 115 err = fmt.Errorf("io too short") 116 continue 117 } 118 tags := opentsdb.TagSet{"name": w.Name, "id": strconv.Itoa(id)} 119 for _, line := range strings.Split(string(limits), "\n") { 120 f := strings.Fields(line) 121 if len(f) == 6 && strings.Join(f[0:3], " ") == "Max open files" { 122 if f[3] != "unlimited" { 123 Add(md, "linux.proc.num_fds_slim", f[3], tags, metadata.Gauge, metadata.Files, descLinuxSoftFileLimit) 124 Add(md, "linux.proc.num_fds_hlim", f[4], tags, metadata.Gauge, metadata.Files, descLinuxHardFileLimit) 125 } 126 } 127 } 128 start_ts := file_status.ModTime().Unix() 129 user, err := strconv.ParseInt(stats[13], 10, 64) 130 if err != nil { 131 return fmt.Errorf("failed to convert process user cpu: %v", err) 132 } 133 sys, err := strconv.ParseInt(stats[14], 10, 64) 134 if err != nil { 135 return fmt.Errorf("failed to convert process system cpu: %v", err) 136 } 137 totalCPU += user + sys 138 Add(md, "linux.proc.cpu", stats[13], opentsdb.TagSet{"type": "user"}.Merge(tags), metadata.Counter, metadata.Pct, descLinuxProcCPUUser) 139 Add(md, "linux.proc.cpu", stats[14], opentsdb.TagSet{"type": "system"}.Merge(tags), metadata.Counter, metadata.Pct, descLinuxProcCPUSystem) 140 Add(md, "linux.proc.mem.fault", stats[9], opentsdb.TagSet{"type": "minflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMin) 
141 Add(md, "linux.proc.mem.fault", stats[11], opentsdb.TagSet{"type": "majflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMax) 142 virtual, err := strconv.ParseInt(stats[22], 10, 64) 143 if err != nil { 144 return fmt.Errorf("failed to convert process virtual memory: %v", err) 145 } 146 totalVirtualMem += virtual 147 rss, err := strconv.ParseInt(stats[23], 10, 64) 148 if err != nil { 149 return fmt.Errorf("failed to convert process rss memory: %v", err) 150 } 151 if pid == strconv.Itoa(os.Getpid()) { 152 TotalScollectorMemoryMB = uint64(rss) * uint64(osPageSize) / 1024 / 1024 153 } 154 totalRSSMem += rss 155 Add(md, "linux.proc.mem.virtual", stats[22], tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemVirtual) 156 Add(md, "linux.proc.mem.rss", stats[23], tags, metadata.Gauge, metadata.Page, descLinuxProcMemRss) 157 Add(md, "linux.proc.mem.rss_bytes", rss*int64(osPageSize), tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemRssBytes) 158 Add(md, "linux.proc.char_io", io[0], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoRead) 159 Add(md, "linux.proc.char_io", io[1], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoWrite) 160 Add(md, "linux.proc.syscall", io[2], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallRead) 161 Add(md, "linux.proc.syscall", io[3], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallWrite) 162 Add(md, "linux.proc.io_bytes", io[4], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcIoBytesRead) 163 Add(md, "linux.proc.io_bytes", io[5], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcIoBytesWrite) 164 Add(md, "linux.proc.num_fds", len(fds), tags, metadata.Gauge, metadata.Files, descLinuxProcFd) 165 Add(md, 
"linux.proc.start_time", start_ts, tags, metadata.Gauge, metadata.Timestamp, descLinuxProcStartTS) 166 Add(md, "linux.proc.uptime", now()-start_ts, tags, metadata.Gauge, metadata.Second, descLinuxProcUptime) 167 Add(md, "linux.proc.pid", pid, tags, metadata.Gauge, metadata.Unit("PID"), osProcPID) 168 } 169 coreCount, err := linuxCoreCount() 170 if err != nil { 171 return fmt.Errorf("failed to get core count: %v", err) 172 } 173 tsName := opentsdb.TagSet{"name": w.Name} 174 if processCount > 0 { 175 Add(md, osProcCPU, float64(totalCPU)/float64(coreCount), tsName, metadata.Counter, metadata.Pct, osProcCPUDesc) 176 Add(md, osProcMemReal, totalRSSMem*int64(os.Getpagesize()), tsName, metadata.Gauge, metadata.Bytes, osProcMemRealDesc) 177 Add(md, osProcMemVirtual, totalVirtualMem, tsName, metadata.Gauge, metadata.Bytes, osProcMemVirtualDesc) 178 Add(md, osProcCount, processCount, tsName, metadata.Gauge, metadata.Process, osProcCountDesc) 179 } 180 if w.IncludeCount { 181 Add(md, "linux.proc.count", processCount, tsName, metadata.Gauge, metadata.Process, descLinuxProcCount) 182 } 183 return err 184 } 185 186 const ( 187 descLinuxProcCPUUser = "The amount of time that this process has been scheduled in user mode." 188 descLinuxProcCPUSystem = "The amount of time that this process has been scheduled in kernel mode" 189 descLinuxProcMemFaultMin = "The number of minor faults the process has made which have not required loading a memory page from disk." 190 descLinuxProcMemFaultMax = "The number of major faults the process has made which have required loading a memory page from disk." 191 descLinuxProcMemVirtual = "The virtual memory size." 192 descLinuxProcMemRss = "The resident set size (number of pages the process has in real memory including shared pages)." 193 descLinuxProcMemRssBytes = "The resident set size (number of bytes the process has in real memory including shared pages)." 
194 descLinuxProcCharIoRead = "The number of bytes which this task has caused to be read from storage. This is simply the sum of bytes which this process passed to read(2) and similar system calls. It includes things such as terminal I/O and is unaffected by whether or not actual physical disk I/O was required (the read might have been satisfied from pagecache)" 195 descLinuxProcCharIoWrite = "The number of bytes which this task has caused, or shall cause to be written to disk. Similar caveats apply here as with read." 196 descLinuxProcSyscallRead = "An attempt to count the number of read I/O operations—that is, system calls such as read(2) and pread(2)." 197 descLinuxProcSyscallWrite = "Attempt to count the number of write I/O operations—that is, system calls such as write(2) and pwrite(2)." 198 descLinuxProcIoBytesRead = "An attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. This is accurate for block-backed filesystems." 199 descLinuxProcIoBytesWrite = "An Attempt to count the number of bytes which this process caused to be sent to the storage layer." 200 descLinuxProcFd = "The number of open file descriptors." 201 descLinuxSoftFileLimit = "The soft limit on the number of open file descriptors." 202 descLinuxHardFileLimit = "The hard limit on the number of open file descriptors." 203 descLinuxProcUptime = "The length of time, in seconds, since the process was started." 204 descLinuxProcStartTS = "The timestamp of process start." 205 descLinuxProcCount = "The number of currently running processes." 206 ) 207 208 type byModTime []os.FileInfo 209 210 func (bmt byModTime) Len() int { return len(bmt) } 211 func (bmt byModTime) Swap(i, j int) { bmt[i], bmt[j] = bmt[j], bmt[i] } 212 func (bmt byModTime) Less(i, j int) bool { 213 // If the creation times are identical, sort by filename (pid) instead. 
214 if bmt[i].ModTime() == bmt[j].ModTime() { 215 return sort.StringsAreSorted([]string{bmt[i].Name(), bmt[j].Name()}) 216 } 217 return bmt[i].ModTime().UnixNano() < bmt[j].ModTime().UnixNano() 218 } 219 220 func getLinuxProccesses() ([]*Process, error) { 221 files, err := ioutil.ReadDir("/proc") 222 if err != nil { 223 return nil, err 224 } 225 sort.Sort(byModTime(files)) 226 var pidFiles []os.FileInfo 227 for _, f := range files { 228 if _, err := strconv.Atoi(f.Name()); err == nil && f.IsDir() { 229 pidFiles = append(pidFiles, f) 230 } 231 } 232 var lps []*Process 233 for _, pidFile := range pidFiles { 234 cl, err := getLinuxCmdline(pidFile.Name()) 235 if err != nil || cl == nil { 236 //Continue because the pid might not exist any more 237 continue 238 } 239 lp := &Process{ 240 Pid: pidFile.Name(), 241 Command: cl[0], 242 Started: pidFile.ModTime(), 243 } 244 if len(cl) > 1 { 245 lp.Arguments = strings.Join(cl[1:], "") 246 } 247 lps = append(lps, lp) 248 } 249 return lps, nil 250 } 251 252 func getLinuxCmdline(pid string) ([]string, error) { 253 cmdline, err := ioutil.ReadFile("/proc/" + pid + "/cmdline") 254 if err != nil { 255 return nil, err 256 } 257 cl := strings.Split(string(cmdline), "\x00") 258 if len(cl) < 1 || len(cl[0]) == 0 { 259 return nil, nil 260 } 261 return cl, nil 262 } 263 264 func c_linux_processes(procs []*WatchedProc) (opentsdb.MultiDataPoint, error) { 265 var md opentsdb.MultiDataPoint 266 lps, err := getLinuxProccesses() 267 if err != nil { 268 return nil, nil 269 } 270 for _, w := range procs { 271 w.Check(lps) 272 if e := linuxProcMonitor(w, &md); e != nil { 273 err = e 274 } 275 } 276 return md, err 277 } 278 279 type Process struct { 280 Pid string 281 Command string 282 Arguments string 283 Started time.Time 284 } 285 286 // NewWatchedProc takes a configuration block [[Process]] from conf 287 func NewWatchedProc(params conf.ProcessParams) (*WatchedProc, error) { 288 if params.Name == "" { 289 params.Name = params.Command 290 } 291 if 
!opentsdb.ValidTSDBString(params.Name) { 292 return nil, fmt.Errorf("bad process name: %v", params.Name) 293 } 294 return &WatchedProc{ 295 Command: regexp.MustCompile(params.Command), 296 Name: params.Name, 297 IncludeCount: params.IncludeCount, 298 Processes: make(map[Process]int), 299 ArgMatch: regexp.MustCompile(params.Args), 300 idPool: new(idPool), 301 }, nil 302 } 303 304 type WatchedProc struct { 305 Command *regexp.Regexp 306 Name string 307 IncludeCount bool 308 Processes map[Process]int 309 ArgMatch *regexp.Regexp 310 *idPool 311 } 312 313 // Check finds all matching processes and assigns them a new unique id. If 314 // WatchedProc has processes that no longer exist, it removes them from 315 // WatchedProc.Processes. 316 func (w *WatchedProc) Check(procs []*Process) { 317 procFound := make(map[Process]bool) 318 for _, l := range procs { 319 if _, ok := w.Processes[*l]; ok { 320 procFound[*l] = true 321 continue 322 } 323 if !w.Command.MatchString(l.Command) { 324 continue 325 } 326 if !w.ArgMatch.MatchString(l.Arguments) { 327 continue 328 } 329 w.Processes[*l] = w.get() 330 procFound[*l] = true 331 } 332 for proc := range w.Processes { 333 if !procFound[proc] { 334 w.Remove(proc) 335 } 336 } 337 } 338 339 func (w *WatchedProc) Remove(proc Process) { 340 w.put(w.Processes[proc]) 341 delete(w.Processes, proc) 342 } 343 344 type idPool struct { 345 free []int 346 next int 347 } 348 349 func (i *idPool) get() int { 350 if len(i.free) == 0 { 351 i.next++ 352 return i.next 353 } 354 sort.Ints(i.free) 355 356 var newId int 357 newId, i.free = i.free[0], i.free[1:] 358 359 return newId 360 } 361 362 func (i *idPool) put(v int) { 363 i.free = append(i.free, v) 364 } 365 366 // InContainer detects if a process is running in a Linux container. 
// The check compares the PID-namespace symlink of pid (/proc/<pid>/ns/pid)
// with that of PID 1 and reports true only when both links can be read and
// differ. If either link cannot be read the result is false.
func InContainer(pid string) bool {
	pidNameSpaceFile := fmt.Sprintf("/proc/%v/ns/pid", pid)
	pidNameSpace, err := os.Readlink(pidNameSpaceFile)
	if err != nil {
		return false
	}
	initNameSpace, err := os.Readlink("/proc/1/ns/pid")
	if err != nil {
		return false
	}
	return initNameSpace != pidNameSpace
}