bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/processes_windows.go (about)

     1  package collectors
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"regexp"
     7  	"strings"
     8  
     9  	"bosun.org/cmd/scollector/conf"
    10  	"bosun.org/metadata"
    11  	"bosun.org/opentsdb"
    12  	"bosun.org/util"
    13  	"github.com/StackExchange/wmi"
    14  )
    15  
    16  var regexesProcesses = []*regexp.Regexp{}
    17  
    18  func AddProcessConfig(params conf.ProcessParams) error {
    19  	if params.Name == "" {
    20  		return fmt.Errorf("empty process Name")
    21  	}
    22  	reg, err := regexp.Compile(params.Name)
    23  	if err != nil {
    24  		return err
    25  	}
    26  	regexesProcesses = append(regexesProcesses, reg)
    27  	return nil
    28  }
    29  
    30  func WatchProcesses() {
    31  	if len(regexesProcesses) == 0 {
    32  		// if no process settings configured in config file, use this set instead.
    33  		regexesProcesses = append(regexesProcesses, regexp.MustCompile("chrome|powershell|scollector|WinRM|MSSQLSERVER"))
    34  	}
    35  	collectors = append(collectors, &IntervalCollector{
    36  		F: c_windows_processes,
    37  	})
    38  }
    39  
    40  func c_windows_processes() (opentsdb.MultiDataPoint, error) {
    41  	var dst []Win32_PerfRawData_PerfProc_Process
    42  	var q = wmi.CreateQuery(&dst, `WHERE Name <> '_Total'`)
    43  	err := queryWmi(q, &dst)
    44  	if err != nil {
    45  		return nil, err
    46  	}
    47  
    48  	var svc_dst []Win32_Service
    49  	var svc_q = wmi.CreateQuery(&svc_dst, "")
    50  	err = queryWmi(svc_q, &svc_dst)
    51  	if err != nil {
    52  		return nil, err
    53  	}
    54  
    55  	var iis_dst []WorkerProcess
    56  	iis_q := wmi.CreateQuery(&iis_dst, "")
    57  	err = queryWmiNamespace(iis_q, &iis_dst, "root\\WebAdministration")
    58  	if err != nil {
    59  		// Don't return from this error since the name space might exist.
    60  		iis_dst = nil
    61  	}
    62  
    63  	var numberOfLogicalProcessors uint64
    64  	var core_dst []Win32_ComputerSystem
    65  	var core_q = wmi.CreateQuery(&core_dst, "")
    66  	err = queryWmi(core_q, &core_dst)
    67  	if err != nil {
    68  		return nil, err
    69  	}
    70  	for _, y := range core_dst {
    71  		numberOfLogicalProcessors = uint64(y.NumberOfLogicalProcessors)
    72  	}
    73  	if numberOfLogicalProcessors == 0 {
    74  		return nil, fmt.Errorf("invalid result: numberOfLogicalProcessors=%v", numberOfLogicalProcessors)
    75  	}
    76  
    77  	var md opentsdb.MultiDataPoint
    78  	var svc_dst_started []Win32_Service
    79  	for _, svc := range svc_dst {
    80  		if util.NameMatches(svc.Name, regexesProcesses) {
    81  			if svc.Started {
    82  				svc_dst_started = append(svc_dst_started, svc)
    83  			}
    84  			tags := opentsdb.TagSet{"name": svc.Name}
    85  			Add(&md, "win.service.started", util.Btoi(svc.Started), tags, metadata.Gauge, metadata.Bool, descWinServiceStarted)
    86  			Add(&md, "win.service.status", util.Btoi(svc.Status != "OK"), tags, metadata.Gauge, metadata.Ok, descWinServiceStatus)
    87  			Add(&md, "win.service.checkpoint", svc.CheckPoint, tags, metadata.Gauge, metadata.None, descWinServiceCheckPoint)
    88  			Add(&md, "win.service.wait_hint", svc.WaitHint, tags, metadata.Gauge, metadata.MilliSecond, descWinServiceWaitHint)
    89  			Add(&md, osServiceRunning, util.Btoi(svc.Started), tags, metadata.Gauge, metadata.Bool, osServiceRunningDesc)
    90  		}
    91  	}
    92  
    93  	totalCPUByName := make(map[string]uint64)
    94  	totalVirtualMemByName := make(map[string]uint64)
    95  	totalPrivateWSMemByName := make(map[string]uint64)
    96  	countByName := make(map[string]int)
    97  
    98  	for _, v := range dst {
    99  		var name string
   100  		service_match := false
   101  		iis_match := false
   102  
   103  		process_match := util.NameMatches(v.Name, regexesProcesses)
   104  
   105  		id := "0"
   106  
   107  		if process_match {
   108  			raw_name := strings.Split(v.Name, "#")
   109  			name = raw_name[0]
   110  			if len(raw_name) == 2 {
   111  				id = raw_name[1]
   112  			}
   113  			// If you have a hash sign in your process name you don't deserve monitoring ;-)
   114  			if len(raw_name) > 2 {
   115  				continue
   116  			}
   117  		}
   118  
   119  		// A Service match could "overwrite" a process match, but that is probably what we would want
   120  		for _, svc := range svc_dst_started {
   121  			// It is possible the pid has gone and been reused, but I think this unlikely
   122  			// And I'm not aware of an atomic join we could do anyways
   123  			if svc.ProcessId != 0 && svc.ProcessId == v.IDProcess {
   124  				id = "0"
   125  				service_match = true
   126  				name = svc.Name
   127  				break
   128  			}
   129  		}
   130  
   131  		for _, a_pool := range iis_dst {
   132  			if a_pool.ProcessId == v.IDProcess {
   133  				id = "0"
   134  				iis_match = true
   135  				name = strings.Join([]string{"iis", a_pool.AppPoolName}, "_")
   136  				break
   137  			}
   138  		}
   139  
   140  		if v.IDProcess == uint32(os.Getpid()) {
   141  			TotalScollectorMemoryMB = v.WorkingSetPrivate / 1024 / 1024
   142  		}
   143  
   144  		if !(service_match || process_match || iis_match) {
   145  			continue
   146  		}
   147  
   148  		//Use timestamp from WMI to fix issues with CPU metrics
   149  		ts := TSys100NStoEpoch(v.Timestamp_Sys100NS)
   150  		tags := opentsdb.TagSet{"name": name, "id": id}
   151  		AddTS(&md, "win.proc.cpu", ts, v.PercentPrivilegedTime/NS100_Seconds/numberOfLogicalProcessors, opentsdb.TagSet{"type": "privileged"}.Merge(tags), metadata.Counter, metadata.Pct, descWinProcCPU_priv)
   152  		AddTS(&md, "win.proc.cpu", ts, v.PercentUserTime/NS100_Seconds/numberOfLogicalProcessors, opentsdb.TagSet{"type": "user"}.Merge(tags), metadata.Counter, metadata.Pct, descWinProcCPU_user)
   153  		totalCPUByName[name] += v.PercentUserTime / NS100_Seconds / numberOfLogicalProcessors
   154  		AddTS(&md, "win.proc.cpu_total", ts, v.PercentProcessorTime/NS100_Seconds/numberOfLogicalProcessors, tags, metadata.Counter, metadata.Pct, descWinProcCPU_total)
   155  		if v.Frequency_Object != 0 {
   156  			Add(&md, "win.proc.elapsed_time", (v.Timestamp_Object-v.ElapsedTime)/v.Frequency_Object, tags, metadata.Gauge, metadata.Second, descWinProcElapsed_time)
   157  		}
   158  		Add(&md, "win.proc.handle_count", v.HandleCount, tags, metadata.Gauge, metadata.Count, descWinProcHandle_count)
   159  		Add(&md, "win.proc.io_bytes", v.IOOtherBytesPersec, opentsdb.TagSet{"type": "other"}.Merge(tags), metadata.Counter, metadata.BytesPerSecond, descWinProcIo_bytes_other)
   160  		Add(&md, "win.proc.io_bytes", v.IOReadBytesPersec, opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.BytesPerSecond, descWinProcIo_bytes_read)
   161  		Add(&md, "win.proc.io_bytes", v.IOWriteBytesPersec, opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.BytesPerSecond, descWinProcIo_bytes_write)
   162  		Add(&md, "win.proc.io_operations", v.IOOtherOperationsPersec, opentsdb.TagSet{"type": "other"}.Merge(tags), metadata.Counter, metadata.Operation, descWinProcIo_operations)
   163  		Add(&md, "win.proc.io_operations", v.IOReadOperationsPersec, opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Operation, descWinProcIo_operations_read)
   164  		Add(&md, "win.proc.io_operations", v.IOWriteOperationsPersec, opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Operation, descWinProcIo_operations_write)
   165  		Add(&md, "win.proc.mem.page_faults", v.PageFaultsPersec, tags, metadata.Counter, metadata.PerSecond, descWinProcMemPage_faults)
   166  		Add(&md, "win.proc.mem.pagefile_bytes", v.PageFileBytes, tags, metadata.Gauge, metadata.Bytes, descWinProcMemPagefile_bytes)
   167  		Add(&md, "win.proc.mem.pagefile_bytes_peak", v.PageFileBytesPeak, tags, metadata.Gauge, metadata.Bytes, descWinProcMemPagefile_bytes_peak)
   168  		Add(&md, "win.proc.mem.pool_nonpaged_bytes", v.PoolNonpagedBytes, tags, metadata.Gauge, metadata.Bytes, descWinProcMemPool_nonpaged_bytes)
   169  		Add(&md, "win.proc.mem.pool_paged_bytes", v.PoolPagedBytes, tags, metadata.Gauge, metadata.Bytes, descWinProcMemPool_paged_bytes)
   170  		Add(&md, "win.proc.mem.vm.bytes", v.VirtualBytes, tags, metadata.Gauge, metadata.Bytes, descWinProcMemVmBytes)
   171  		totalVirtualMemByName[name] += v.VirtualBytes
   172  		Add(&md, "win.proc.mem.vm.bytes_peak", v.VirtualBytesPeak, tags, metadata.Gauge, metadata.Bytes, descWinProcMemVmBytes_peak)
   173  		Add(&md, "win.proc.mem.working_set", v.WorkingSet, tags, metadata.Gauge, metadata.Bytes, descWinProcMemWorking_set)
   174  		Add(&md, "win.proc.mem.working_set_peak", v.WorkingSetPeak, tags, metadata.Gauge, metadata.Bytes, descWinProcMemWorking_set_peak)
   175  		Add(&md, "win.proc.mem.working_set_private", v.WorkingSetPrivate, tags, metadata.Gauge, metadata.Bytes, descWinProcMemWorking_set_private)
   176  		totalPrivateWSMemByName[name] += v.WorkingSetPrivate
   177  		Add(&md, "win.proc.priority_base", v.PriorityBase, tags, metadata.Gauge, metadata.None, descWinProcPriority_base)
   178  		Add(&md, "win.proc.private_bytes", v.PrivateBytes, tags, metadata.Gauge, metadata.Bytes, descWinProcPrivate_bytes)
   179  		Add(&md, "win.proc.thread_count", v.ThreadCount, tags, metadata.Gauge, metadata.Count, descWinProcthread_count)
   180  		Add(&md, "win.proc.pid", v.IDProcess, tags, metadata.Gauge, metadata.Unit("PID"), osProcPID)
   181  		countByName[name]++
   182  	}
   183  	for name, count := range countByName {
   184  		if count < 1 {
   185  			continue
   186  		}
   187  		Add(&md, osProcCount, count, opentsdb.TagSet{"name": name}, metadata.Gauge, metadata.Process, osProcCountDesc)
   188  		if totalCPU, ok := totalCPUByName[name]; ok {
   189  			Add(&md, osProcCPU, totalCPU, opentsdb.TagSet{"name": name}, metadata.Counter, metadata.Pct, osProcCPUDesc)
   190  		}
   191  		if totalVM, ok := totalVirtualMemByName[name]; ok {
   192  			Add(&md, osProcMemVirtual, totalVM, opentsdb.TagSet{"name": name}, metadata.Gauge, metadata.Bytes, osProcMemVirtualDesc)
   193  		}
   194  		if totalPWS, ok := totalPrivateWSMemByName[name]; ok {
   195  			Add(&md, osProcMemReal, totalPWS, opentsdb.TagSet{"name": name}, metadata.Gauge, metadata.Bytes, osProcMemRealDesc)
   196  		}
   197  	}
   198  	return md, nil
   199  }
   200  
   201  // Divide CPU by 1e5 because: 1 seconds / 100 Nanoseconds = 1e7. This is the
   202  // percent time as a decimal, so divide by two less zeros to make it the same as
   203  // the result * 100.
   204  const NS100_Seconds = 1e5
   205  
   206  const (
   207  	descWinProcCPU_priv               = "Percentage of elapsed time that this thread has spent executing code in privileged mode."
   208  	descWinProcCPU_total              = "Percentage of elapsed time that this process's threads have spent executing code in user or privileged mode."
   209  	descWinProcCPU_user               = "Percentage of elapsed time that this process's threads have spent executing code in user mode."
   210  	descWinProcElapsed_time           = "Elapsed time in seconds this process has been running."
   211  	descWinProcHandle_count           = "Total number of handles the process has open across all threads."
   212  	descWinProcIo_bytes_other         = "Rate at which the process is issuing bytes to I/O operations that do not involve data such as control operations."
   213  	descWinProcIo_bytes_read          = "Rate at which the process is reading bytes from I/O operations."
   214  	descWinProcIo_bytes_write         = "Rate at which the process is writing bytes to I/O operations."
   215  	descWinProcIo_operations          = "Rate at which the process is issuing I/O operations that are neither a read or a write request."
   216  	descWinProcIo_operations_read     = "Rate at which the process is issuing read I/O operations."
   217  	descWinProcIo_operations_write    = "Rate at which the process is issuing write I/O operations."
   218  	descWinProcMemPage_faults         = "Rate of page faults by the threads executing in this process."
   219  	descWinProcMemPagefile_bytes      = "Current number of bytes this process has used in the paging file(s)."
   220  	descWinProcMemPagefile_bytes_peak = "Maximum number of bytes this process has used in the paging file(s)."
   221  	descWinProcMemPool_nonpaged_bytes = "Total number of bytes for objects that cannot be written to disk when they are not being used."
   222  	descWinProcMemPool_paged_bytes    = "Total number of bytes for objects that can be written to disk when they are not being used."
   223  	descWinProcMemVmBytes             = "Current size, in bytes, of the virtual address space that the process is using."
   224  	descWinProcMemVmBytes_peak        = "Maximum number of bytes of virtual address space that the process has used at any one time."
   225  	descWinProcMemWorking_set         = "Current number of bytes in the working set of this process at any point in time."
   226  	descWinProcMemWorking_set_peak    = "Maximum number of bytes in the working set of this process at any point in time."
   227  	descWinProcMemWorking_set_private = "Current number of bytes in the working set that are not shared with other processes."
   228  	descWinProcPriority_base          = "Current base priority of this process. Threads within a process can raise and lower their own base priority relative to the process base priority of the process."
   229  	descWinProcPrivate_bytes          = "Current number of bytes this process has allocated that cannot be shared with other processes."
   230  	descWinProcthread_count           = "Number of threads currently active in this process."
   231  )
   232  
   233  // Actually a CIM_StatisticalInformation.
   234  type Win32_PerfRawData_PerfProc_Process struct {
   235  	ElapsedTime             uint64
   236  	Frequency_Object        uint64
   237  	HandleCount             uint32
   238  	IDProcess               uint32
   239  	IOOtherBytesPersec      uint64
   240  	IOOtherOperationsPersec uint64
   241  	IOReadBytesPersec       uint64
   242  	IOReadOperationsPersec  uint64
   243  	IOWriteBytesPersec      uint64
   244  	IOWriteOperationsPersec uint64
   245  	Name                    string
   246  	PageFaultsPersec        uint32
   247  	PageFileBytes           uint64
   248  	PageFileBytesPeak       uint64
   249  	PercentPrivilegedTime   uint64
   250  	PercentProcessorTime    uint64
   251  	PercentUserTime         uint64
   252  	PoolNonpagedBytes       uint32
   253  	PoolPagedBytes          uint32
   254  	PriorityBase            uint32
   255  	PrivateBytes            uint64
   256  	ThreadCount             uint32
   257  	Timestamp_Object        uint64
   258  	Timestamp_Sys100NS      uint64
   259  	VirtualBytes            uint64
   260  	VirtualBytesPeak        uint64
   261  	WorkingSet              uint64
   262  	WorkingSetPeak          uint64
   263  	WorkingSetPrivate       uint64
   264  }
   265  
   266  const (
   267  	descWinServiceCheckPoint = "The CheckPoint property specifies a value that the service increments periodically to report its progress during a lengthy start, stop, pause, or continue operation. For example, the service should increment this value as it completes each step of its initialization when it is starting up. The user interface program that invoked the operation on the service uses this value to track the progress of the service during a lengthy operation. This value is not valid and should be zero when the service does not have a start, stop, pause, or continue operation pending."
   268  	descWinServiceStarted    = "Started is a boolean indicating whether the service has been started (TRUE), or stopped (FALSE)."
   269  	descWinServiceStatus     = "The Status property indicates the current status of the object. Right now 0=OK and 1=Not OK, but various operational and non-operational statuses can be defined such as OK, Degraded,  Pred Fail, Error, Starting, Stopping, and Service."
   270  	descWinServiceWaitHint   = "The WaitHint property specifies the estimated time required (in milliseconds) for a pending start, stop, pause, or continue operation. After the specified amount of time has elapsed, the service makes its next call to the SetServiceStatus function with either an incremented CheckPoint value or a change in Current State. If the amount of time specified by WaitHint passes, and CheckPoint has not been incremented, or the Current State has not changed, the service control manager or service control program assumes that an error has occurred."
   271  )
   272  
// Win32_Service receives one service row from WMI.
//
// Actually a Win32_BaseService.
type Win32_Service struct {
	CheckPoint uint32 // reported as win.service.checkpoint
	Name       string // matched against regexesProcesses; used as the "name" tag
	ProcessId  uint32 // compared with a process's IDProcess to attribute it to this service; 0 is ignored
	Started    bool   // reported as win.service.started; only started services are matched to processes
	Status     string // any value other than "OK" is reported as 1 in win.service.status
	WaitHint   uint32 // reported as win.service.wait_hint (milliseconds)
	StartMode  string // not read by the collector code in this file
}
   283  
// WorkerProcess receives one row of the WorkerProcess query that is run
// against the root\WebAdministration namespace.
type WorkerProcess struct {
	AppPoolName string // appended to "iis_" to form the reported process name
	ProcessId   uint32 // compared with a process's IDProcess to find the worker's counters
}