bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/processes_linux.go (about)

     1  package collectors
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"regexp"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  	"time"
    12  
    13  	"bosun.org/cmd/scollector/conf"
    14  	"bosun.org/metadata"
    15  	"bosun.org/opentsdb"
    16  )
    17  
    18  func AddProcessConfig(params conf.ProcessParams) error {
    19  	p, err := NewWatchedProc(params)
    20  	if err != nil {
    21  		return err
    22  	}
    23  	watchedProcs = append(watchedProcs, p)
    24  	return nil
    25  }
    26  
    27  var watchedProcs = []*WatchedProc{}
    28  
    29  var osPageSize = os.Getpagesize()
    30  
    31  // linuxCoreCount counts the number of logical cpus since that is how cpu ticks
    32  // are tracked
    33  func linuxCoreCount() (c int64, err error) {
    34  	if err = readLine("/proc/cpuinfo", func(s string) (err error) {
    35  		f := strings.Fields(s)
    36  		if len(f) > 2 && f[0] == "processor" {
    37  			c++
    38  			return
    39  		}
    40  		return
    41  	}); err != nil {
    42  		return c, fmt.Errorf("failed to read /proc/cpuinfo to get cpu core count: %v", err)
    43  	}
    44  	if c == 0 {
    45  		return c, fmt.Errorf("got a core count of 0, expected at least one core")
    46  	}
    47  	return
    48  }
    49  
    50  func WatchProcesses() {
    51  	if len(watchedProcs) == 0 {
    52  		return
    53  	}
    54  	collectors = append(collectors, &IntervalCollector{
    55  		F: func() (opentsdb.MultiDataPoint, error) {
    56  			return c_linux_processes(watchedProcs)
    57  		},
    58  		name: "c_linux_processes",
    59  	})
    60  }
    61  
    62  func linuxProcMonitor(w *WatchedProc, md *opentsdb.MultiDataPoint) error {
    63  	var err error
    64  	var processCount int
    65  	var totalCPU int64
    66  	var totalVirtualMem int64
    67  	var totalRSSMem int64
    68  	for proc, id := range w.Processes {
    69  		pid := proc.Pid
    70  		file_status, e := os.Stat("/proc/" + pid)
    71  		if e != nil {
    72  			w.Remove(proc)
    73  			continue
    74  		}
    75  		processCount++
    76  		stats_file, e := ioutil.ReadFile("/proc/" + pid + "/stat")
    77  		if e != nil {
    78  			w.Remove(proc)
    79  			continue
    80  		}
    81  		io_file, e := ioutil.ReadFile("/proc/" + pid + "/io")
    82  		if e != nil {
    83  			w.Remove(proc)
    84  			continue
    85  		}
    86  		limits, e := ioutil.ReadFile("/proc/" + pid + "/limits")
    87  		if e != nil {
    88  			w.Remove(proc)
    89  			continue
    90  		}
    91  		fd_dir, e := os.Open("/proc/" + pid + "/fd")
    92  		if e != nil {
    93  			w.Remove(proc)
    94  			continue
    95  		}
    96  		fds, e := fd_dir.Readdirnames(0)
    97  		fd_dir.Close()
    98  		if e != nil {
    99  			w.Remove(proc)
   100  			continue
   101  		}
   102  		stats := strings.Fields(string(stats_file))
   103  		if len(stats) < 24 {
   104  			err = fmt.Errorf("stats too short")
   105  			continue
   106  		}
   107  		var io []string
   108  		for _, line := range strings.Split(string(io_file), "\n") {
   109  			f := strings.Fields(line)
   110  			if len(f) == 2 {
   111  				io = append(io, f[1])
   112  			}
   113  		}
   114  		if len(io) < 6 {
   115  			err = fmt.Errorf("io too short")
   116  			continue
   117  		}
   118  		tags := opentsdb.TagSet{"name": w.Name, "id": strconv.Itoa(id)}
   119  		for _, line := range strings.Split(string(limits), "\n") {
   120  			f := strings.Fields(line)
   121  			if len(f) == 6 && strings.Join(f[0:3], " ") == "Max open files" {
   122  				if f[3] != "unlimited" {
   123  					Add(md, "linux.proc.num_fds_slim", f[3], tags, metadata.Gauge, metadata.Files, descLinuxSoftFileLimit)
   124  					Add(md, "linux.proc.num_fds_hlim", f[4], tags, metadata.Gauge, metadata.Files, descLinuxHardFileLimit)
   125  				}
   126  			}
   127  		}
   128  		start_ts := file_status.ModTime().Unix()
   129  		user, err := strconv.ParseInt(stats[13], 10, 64)
   130  		if err != nil {
   131  			return fmt.Errorf("failed to convert process user cpu: %v", err)
   132  		}
   133  		sys, err := strconv.ParseInt(stats[14], 10, 64)
   134  		if err != nil {
   135  			return fmt.Errorf("failed to convert process system cpu: %v", err)
   136  		}
   137  		totalCPU += user + sys
   138  		Add(md, "linux.proc.cpu", stats[13], opentsdb.TagSet{"type": "user"}.Merge(tags), metadata.Counter, metadata.Pct, descLinuxProcCPUUser)
   139  		Add(md, "linux.proc.cpu", stats[14], opentsdb.TagSet{"type": "system"}.Merge(tags), metadata.Counter, metadata.Pct, descLinuxProcCPUSystem)
   140  		Add(md, "linux.proc.mem.fault", stats[9], opentsdb.TagSet{"type": "minflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMin)
   141  		Add(md, "linux.proc.mem.fault", stats[11], opentsdb.TagSet{"type": "majflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMax)
   142  		virtual, err := strconv.ParseInt(stats[22], 10, 64)
   143  		if err != nil {
   144  			return fmt.Errorf("failed to convert process virtual memory: %v", err)
   145  		}
   146  		totalVirtualMem += virtual
   147  		rss, err := strconv.ParseInt(stats[23], 10, 64)
   148  		if err != nil {
   149  			return fmt.Errorf("failed to convert process rss memory: %v", err)
   150  		}
   151  		if pid == strconv.Itoa(os.Getpid()) {
   152  			TotalScollectorMemoryMB = uint64(rss) * uint64(osPageSize) / 1024 / 1024
   153  		}
   154  		totalRSSMem += rss
   155  		Add(md, "linux.proc.mem.virtual", stats[22], tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemVirtual)
   156  		Add(md, "linux.proc.mem.rss", stats[23], tags, metadata.Gauge, metadata.Page, descLinuxProcMemRss)
   157  		Add(md, "linux.proc.mem.rss_bytes", rss*int64(osPageSize), tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemRssBytes)
   158  		Add(md, "linux.proc.char_io", io[0], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoRead)
   159  		Add(md, "linux.proc.char_io", io[1], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoWrite)
   160  		Add(md, "linux.proc.syscall", io[2], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallRead)
   161  		Add(md, "linux.proc.syscall", io[3], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallWrite)
   162  		Add(md, "linux.proc.io_bytes", io[4], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcIoBytesRead)
   163  		Add(md, "linux.proc.io_bytes", io[5], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcIoBytesWrite)
   164  		Add(md, "linux.proc.num_fds", len(fds), tags, metadata.Gauge, metadata.Files, descLinuxProcFd)
   165  		Add(md, "linux.proc.start_time", start_ts, tags, metadata.Gauge, metadata.Timestamp, descLinuxProcStartTS)
   166  		Add(md, "linux.proc.uptime", now()-start_ts, tags, metadata.Gauge, metadata.Second, descLinuxProcUptime)
   167  		Add(md, "linux.proc.pid", pid, tags, metadata.Gauge, metadata.Unit("PID"), osProcPID)
   168  	}
   169  	coreCount, err := linuxCoreCount()
   170  	if err != nil {
   171  		return fmt.Errorf("failed to get core count: %v", err)
   172  	}
   173  	tsName := opentsdb.TagSet{"name": w.Name}
   174  	if processCount > 0 {
   175  		Add(md, osProcCPU, float64(totalCPU)/float64(coreCount), tsName, metadata.Counter, metadata.Pct, osProcCPUDesc)
   176  		Add(md, osProcMemReal, totalRSSMem*int64(os.Getpagesize()), tsName, metadata.Gauge, metadata.Bytes, osProcMemRealDesc)
   177  		Add(md, osProcMemVirtual, totalVirtualMem, tsName, metadata.Gauge, metadata.Bytes, osProcMemVirtualDesc)
   178  		Add(md, osProcCount, processCount, tsName, metadata.Gauge, metadata.Process, osProcCountDesc)
   179  	}
   180  	if w.IncludeCount {
   181  		Add(md, "linux.proc.count", processCount, tsName, metadata.Gauge, metadata.Process, descLinuxProcCount)
   182  	}
   183  	return err
   184  }
   185  
   186  const (
   187  	descLinuxProcCPUUser      = "The amount of time that this process has been scheduled in user mode."
   188  	descLinuxProcCPUSystem    = "The amount of time that this process has been scheduled in kernel mode"
   189  	descLinuxProcMemFaultMin  = "The number of minor faults the process has made which have not required loading a memory page from disk."
   190  	descLinuxProcMemFaultMax  = "The number of major faults the process has made which have required loading a memory page from disk."
   191  	descLinuxProcMemVirtual   = "The virtual memory size."
   192  	descLinuxProcMemRss       = "The resident set size (number of pages the process has in real memory including shared pages)."
   193  	descLinuxProcMemRssBytes  = "The resident set size (number of bytes the process has in real memory including shared pages)."
   194  	descLinuxProcCharIoRead   = "The number of bytes which this task has caused to be read from storage. This is simply the sum of bytes which this process passed to read(2) and similar system calls. It includes things such as terminal I/O and is unaffected by whether or not actual physical disk I/O was required (the read might have been satisfied from pagecache)"
   195  	descLinuxProcCharIoWrite  = "The number of bytes which this task has caused, or shall cause to be written to disk. Similar caveats apply here as with read."
   196  	descLinuxProcSyscallRead  = "An attempt to count the number of read I/O operations—that is, system calls such as read(2) and pread(2)."
   197  	descLinuxProcSyscallWrite = "Attempt to count the number of write I/O operations—that is, system calls such as write(2) and pwrite(2)."
   198  	descLinuxProcIoBytesRead  = "An attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. This is accurate for block-backed filesystems."
   199  	descLinuxProcIoBytesWrite = "An Attempt to count the number of bytes which this process caused to be sent to the storage layer."
   200  	descLinuxProcFd           = "The number of open file descriptors."
   201  	descLinuxSoftFileLimit    = "The soft limit on the number of open file descriptors."
   202  	descLinuxHardFileLimit    = "The hard limit on the number of open file descriptors."
   203  	descLinuxProcUptime       = "The length of time, in seconds, since the process was started."
   204  	descLinuxProcStartTS      = "The timestamp of process start."
   205  	descLinuxProcCount        = "The number of currently running processes."
   206  )
   207  
   208  type byModTime []os.FileInfo
   209  
   210  func (bmt byModTime) Len() int      { return len(bmt) }
   211  func (bmt byModTime) Swap(i, j int) { bmt[i], bmt[j] = bmt[j], bmt[i] }
   212  func (bmt byModTime) Less(i, j int) bool {
   213  	// If the creation times are identical, sort by filename (pid) instead.
   214  	if bmt[i].ModTime() == bmt[j].ModTime() {
   215  		return sort.StringsAreSorted([]string{bmt[i].Name(), bmt[j].Name()})
   216  	}
   217  	return bmt[i].ModTime().UnixNano() < bmt[j].ModTime().UnixNano()
   218  }
   219  
   220  func getLinuxProccesses() ([]*Process, error) {
   221  	files, err := ioutil.ReadDir("/proc")
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  	sort.Sort(byModTime(files))
   226  	var pidFiles []os.FileInfo
   227  	for _, f := range files {
   228  		if _, err := strconv.Atoi(f.Name()); err == nil && f.IsDir() {
   229  			pidFiles = append(pidFiles, f)
   230  		}
   231  	}
   232  	var lps []*Process
   233  	for _, pidFile := range pidFiles {
   234  		cl, err := getLinuxCmdline(pidFile.Name())
   235  		if err != nil || cl == nil {
   236  			//Continue because the pid might not exist any more
   237  			continue
   238  		}
   239  		lp := &Process{
   240  			Pid:     pidFile.Name(),
   241  			Command: cl[0],
   242  			Started: pidFile.ModTime(),
   243  		}
   244  		if len(cl) > 1 {
   245  			lp.Arguments = strings.Join(cl[1:], "")
   246  		}
   247  		lps = append(lps, lp)
   248  	}
   249  	return lps, nil
   250  }
   251  
   252  func getLinuxCmdline(pid string) ([]string, error) {
   253  	cmdline, err := ioutil.ReadFile("/proc/" + pid + "/cmdline")
   254  	if err != nil {
   255  		return nil, err
   256  	}
   257  	cl := strings.Split(string(cmdline), "\x00")
   258  	if len(cl) < 1 || len(cl[0]) == 0 {
   259  		return nil, nil
   260  	}
   261  	return cl, nil
   262  }
   263  
   264  func c_linux_processes(procs []*WatchedProc) (opentsdb.MultiDataPoint, error) {
   265  	var md opentsdb.MultiDataPoint
   266  	lps, err := getLinuxProccesses()
   267  	if err != nil {
   268  		return nil, nil
   269  	}
   270  	for _, w := range procs {
   271  		w.Check(lps)
   272  		if e := linuxProcMonitor(w, &md); e != nil {
   273  			err = e
   274  		}
   275  	}
   276  	return md, err
   277  }
   278  
   279  type Process struct {
   280  	Pid       string
   281  	Command   string
   282  	Arguments string
   283  	Started   time.Time
   284  }
   285  
   286  // NewWatchedProc takes a configuration block [[Process]] from conf
   287  func NewWatchedProc(params conf.ProcessParams) (*WatchedProc, error) {
   288  	if params.Name == "" {
   289  		params.Name = params.Command
   290  	}
   291  	if !opentsdb.ValidTSDBString(params.Name) {
   292  		return nil, fmt.Errorf("bad process name: %v", params.Name)
   293  	}
   294  	return &WatchedProc{
   295  		Command:      regexp.MustCompile(params.Command),
   296  		Name:         params.Name,
   297  		IncludeCount: params.IncludeCount,
   298  		Processes:    make(map[Process]int),
   299  		ArgMatch:     regexp.MustCompile(params.Args),
   300  		idPool:       new(idPool),
   301  	}, nil
   302  }
   303  
   304  type WatchedProc struct {
   305  	Command      *regexp.Regexp
   306  	Name         string
   307  	IncludeCount bool
   308  	Processes    map[Process]int
   309  	ArgMatch     *regexp.Regexp
   310  	*idPool
   311  }
   312  
   313  // Check finds all matching processes and assigns them a new unique id. If
   314  // WatchedProc has processes that no longer exist, it removes them from
   315  // WatchedProc.Processes.
   316  func (w *WatchedProc) Check(procs []*Process) {
   317  	procFound := make(map[Process]bool)
   318  	for _, l := range procs {
   319  		if _, ok := w.Processes[*l]; ok {
   320  			procFound[*l] = true
   321  			continue
   322  		}
   323  		if !w.Command.MatchString(l.Command) {
   324  			continue
   325  		}
   326  		if !w.ArgMatch.MatchString(l.Arguments) {
   327  			continue
   328  		}
   329  		w.Processes[*l] = w.get()
   330  		procFound[*l] = true
   331  	}
   332  	for proc := range w.Processes {
   333  		if !procFound[proc] {
   334  			w.Remove(proc)
   335  		}
   336  	}
   337  }
   338  
   339  func (w *WatchedProc) Remove(proc Process) {
   340  	w.put(w.Processes[proc])
   341  	delete(w.Processes, proc)
   342  }
   343  
   344  type idPool struct {
   345  	free []int
   346  	next int
   347  }
   348  
   349  func (i *idPool) get() int {
   350  	if len(i.free) == 0 {
   351  		i.next++
   352  		return i.next
   353  	}
   354  	sort.Ints(i.free)
   355  
   356  	var newId int
   357  	newId, i.free = i.free[0], i.free[1:]
   358  
   359  	return newId
   360  }
   361  
   362  func (i *idPool) put(v int) {
   363  	i.free = append(i.free, v)
   364  }
   365  
   366  // InContainer detects if a process is running in a Linux container.
   367  func InContainer(pid string) bool {
   368  	pidNameSpaceFile := fmt.Sprintf("/proc/%v/ns/pid", pid)
   369  	if pidNameSpace, err := os.Readlink(pidNameSpaceFile); err == nil {
   370  		if initNameSpace, err := os.Readlink("/proc/1/ns/pid"); err == nil {
   371  			return initNameSpace != pidNameSpace
   372  		}
   373  	}
   374  	return false
   375  }