github.com/dylandreimerink/gobpfld@v0.6.1-0.20220205171531-e79c330ad608/internal/syscall/perf.go (about)

     1  package syscall
     2  
     3  import (
     4  	"syscall"
     5  	"unsafe"
     6  
     7  	"golang.org/x/sys/unix"
     8  )
     9  
    10  // PerfEventAttr is the go version of the perf_event_attr struct as defined by the kernel.
    11  // https://elixir.bootlin.com/linux/v5.14.14/source/include/uapi/linux/perf_event.h#L338
    12  type PerfEventAttr struct {
    13  	Type   PerfType
    14  	Size   uint32
    15  	Config uint64
    16  	// union of sample_period and sample_frequency
    17  	SamplePeriodFreq uint64
    18  	SampleType       uint64
    19  	AttrFlags        PerfAttrFlags
    20  	// union of wakeup_events and wakeup_watermark
    21  	WakeupEventsWatermark uint32
    22  	BPType                uint32
    23  	// union of bp_addr, kprobe_func, uprobe_path, and config1
    24  	BPAddr uintptr
    25  	// union of bp_len, kprobe_addr, probe_offset, and config2
    26  	BPLen uint64
    27  	// Unum of perf_branch_sample_type
    28  	BranchSampleType uint64
    29  	// Defines set of user regs to dump on samples.
    30  	// See asm/perf_regs.h for details.
    31  	SampleRegsUser uint64
    32  	// Defines size of the user stack to dump on samples.
    33  	SampleStackUser uint32
    34  	ClockID         int32
    35  	// Defines set of regs to dump for each sample
    36  	// state captured on:
    37  	//  - precise = 0: PMU interrupt
    38  	//  - precise > 0: sampled instruction
    39  	//
    40  	// See asm/perf_regs.h for details.
    41  	SampleRegsIntr uint64
    42  	// Wakeup watermark for AUX area
    43  	AUXWatermark   uint32
    44  	SampleMaxStack uint16
    45  	// __reserved_2
    46  	_             uint16
    47  	AUXSampleSize uint32
    48  	// __reserved_3
    49  	_       uint32
    50  	SigData uint64
    51  }
    52  
    53  const AttrSize = uint32(unsafe.Sizeof(PerfEventAttr{}))
    54  
    55  // PerfType https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/perf_event.h#L32
    56  type PerfType uint32
    57  
    58  const (
    59  	// PERF_TYPE_HARDWARE This indicates one of the "generalized"  hardware  events
    60  	// provided  by the kernel.  See the config field definition
    61  	// for more details.
    62  	PERF_TYPE_HARDWARE PerfType = iota
    63  
    64  	// PERF_TYPE_SOFTWARE This indicates one of the  software-defined  events  provided
    65  	// by  the  kernel  (even  if  no hardware support is
    66  	// available).
    67  	PERF_TYPE_SOFTWARE
    68  
    69  	// PERF_TYPE_TRACEPOINT This indicates a tracepoint provided by the kernel tracepoint infrastructure.
    70  	PERF_TYPE_TRACEPOINT
    71  
    72  	// PERF_TYPE_HW_CACHE  This  indicates  a hardware cache event. This has a special encoding,
    73  	// described in the config field definition.
    74  	PERF_TYPE_HW_CACHE
    75  
    76  	// PERF_TYPE_RAW This indicates a "raw" implementation-specific  event  in
    77  	// the config field.
    78  	PERF_TYPE_RAW
    79  
    80  	// PERF_TYPE_BREAKPOINT This  indicates  a hardware breakpoint as provided by the CPU.
    81  	// Breakpoints can be read/write accesses  to  an  address as well as execution of an instruction address.
    82  	PERF_TYPE_BREAKPOINT
    83  )
    84  
    85  // PerfAttrFlags are used to pass a lot of boolean flags efficiently to the kerenl
    86  type PerfAttrFlags uint64
    87  
    88  const (
    89  	// PerfAttrFlagsDisabled off by default
    90  	PerfAttrFlagsDisabled PerfAttrFlags = 1 << iota
    91  	// PerfAttrFlagsInherit children inherit it
    92  	PerfAttrFlagsInherit
    93  	// PerfAttrFlagsPinned must always be on PMU
    94  	PerfAttrFlagsPinned
    95  	// PerfAttrFlagsExclusive only group on PMU
    96  	PerfAttrFlagsExclusive
    97  	// PerfAttrFlagsExcludeUser don't count user
    98  	PerfAttrFlagsExcludeUser
    99  	// PerfAttrFlagsExcludeKernel ditto kernel
   100  	PerfAttrFlagsExcludeKernel
   101  	// PerfAttrFlagsExcludeHV ditto hypervisor
   102  	PerfAttrFlagsExcludeHV
   103  	// PerfAttrFlagsExcludeIdle don't count when idle
   104  	PerfAttrFlagsExcludeIdle
   105  	// PerfAttrFlagsMmap include mmap data
   106  	PerfAttrFlagsMmap
   107  	// PerfAttrFlagsComm include comm data
   108  	PerfAttrFlagsComm
   109  	// PerfAttrFlagsFreq use freq, not period
   110  	PerfAttrFlagsFreq
   111  	// PerfAttrFlagsInheritStat per task counts
   112  	PerfAttrFlagsInheritStat
   113  	// PerfAttrFlagsEnableOnExec next exec enables
   114  	PerfAttrFlagsEnableOnExec
   115  	// PerfAttrFlagsTask trace fork/exit
   116  	PerfAttrFlagsTask
   117  	// PerfAttrFlagsWatermark wakeup_watermark
   118  	PerfAttrFlagsWatermark
   119  	// PerfAttrFlagsPreciseIPConstantSkid SAMPLE_IP must have constant skid, See also PERF_RECORD_MISC_EXACT_IP
   120  	PerfAttrFlagsPreciseIPConstantSkid PerfAttrFlags = 1 << 15
   121  	// PerfAttrFlagsPreciseIPRequestZeroSkid SAMPLE_IP requested to have 0 skid, See also PERF_RECORD_MISC_EXACT_IP
   122  	PerfAttrFlagsPreciseIPRequestZeroSkid PerfAttrFlags = 1 << 16
   123  	// PerfAttrFlagsPreciseIPRequireZeroSkid SAMPLE_IP must have 0 skid, See also PERF_RECORD_MISC_EXACT_IP
   124  	PerfAttrFlagsPreciseIPRequireZeroSkid PerfAttrFlags = 1<<16 + 1<<15
   125  )
   126  
   127  type PerfEventOpenFlags uintptr
   128  
   129  const (
   130  	// PerfEventOpenFDNoGroup This  flag  tells the event to ignore the group_fd parameter ex‐
   131  	// cept for the purpose of setting up output redirection using  the
   132  	// PERF_FLAG_FD_OUTPUT flag.
   133  	PerfEventOpenFDNoGroup PerfEventOpenFlags = 1 << iota
   134  
   135  	// PerfEventOpenFDOutput This flag re-routes the event's sampled output to instead be in‐
   136  	// cluded in the mmap buffer of the event specified by group_fd.
   137  	PerfEventOpenFDOutput
   138  
   139  	// PerfEventOpenPIDCgroup This  flag  tells the event to ignore the group_fd parameter ex‐
   140  	// cept for the purpose of setting up output redirection using  the
   141  	// PERF_FLAG_FD_OUTPUT flag.
   142  	PerfEventOpenPIDCgroup
   143  
   144  	// PerfEventOpenFDCloseOnExit This  flag  enables the close-on-exec flag for the created event
   145  	// file descriptor, so that the file  descriptor  is  automatically
   146  	// closed  on  execve(2).   Setting the close-on-exec flags at cre‐
   147  	// ation time, rather than later with  fcntl(2),  avoids  potential
   148  	// race    conditions    where    the    calling   thread   invokes
   149  	// perf_event_open() and fcntl(2)  at  the  same  time  as  another
   150  	// thread calls fork(2) then execve(2).
   151  	PerfEventOpenFDCloseOnExit
   152  )
   153  
   154  // PerfEventOpen is a wrapper around the perf_event_open syscall.
   155  func PerfEventOpen(attr PerfEventAttr, pid, cpu, groupFD int, flags PerfEventOpenFlags) (uintptr, error) {
   156  	fd, _, errno := unix.Syscall6(
   157  		unix.SYS_PERF_EVENT_OPEN,
   158  		uintptr(unsafe.Pointer(&attr)),
   159  		uintptr(pid),
   160  		uintptr(cpu),
   161  		uintptr(groupFD),
   162  		uintptr(flags),
   163  		0,
   164  	)
   165  	if errno != 0 {
   166  		return 0, &Error{
   167  			Errno: errno,
   168  			Err:   perfEventOpenErrors[errno],
   169  		}
   170  	}
   171  
   172  	return fd, nil
   173  }
   174  
// perfEventOpenErrors maps an errno reported by the perf_event_open syscall to a human readable
// explanation. PerfEventOpen looks the errno up here to fill the Err field of the Error it
// returns; an errno without an entry yields an empty string.
//
// NOTE(review): the texts look like they were copied from the ERRORS section of the
// perf_event_open(2) man page and still carry its justified spacing and line-break hyphenation
// (e.g. "un‐ privileged", "pos‐ sible"). They are runtime strings, so they are left untouched here.
var perfEventOpenErrors = map[syscall.Errno]string{
	unix.E2BIG: "The perf_event_attr size value is too small (smaller " +
		"than PERF_ATTR_SIZE_VER0), too big (larger than the page  size), " +
		"or  larger  than the kernel supports and the extra bytes are not " +
		"zero.  When E2BIG is returned, the perf_event_attr size field is " +
		"overwritten by the kernel to be the size of the structure it was " +
		"expecting.",

	unix.EACCES: "The requested event  requires  CAP_PERFMON  (since " +
		"Linux  5.8)  or  CAP_SYS_ADMIN permissions (or a more permissive " +
		"perf_event paranoid setting).  Some common cases  where  an  un‐ " +
		"privileged  process  may  encounter  this  error: attaching to a " +
		"process owned by a different user; monitoring all processes on a " +
		"given  CPU  (i.e.,  specifying  the pid argument as -1); and not " +
		"setting exclude_kernel when the paranoid setting requires it.",

	unix.EBADF: "The group_fd file descriptor is not  valid,  or,  if " +
		"PERF_FLAG_PID_CGROUP  is  set, the cgroup file descriptor in pid " +
		"is not valid.",

	unix.EBUSY: "Another event already has exclusive  access  to  the PMU.",

	unix.EFAULT: "The  attr  pointer points at an invalid memory address.",

	unix.EINVAL: "The specified event is invalid.  There are many pos‐ " +
		"sible  reasons  for this.  A not-exhaustive list: sample_freq is " +
		"higher than the maximum setting; the cpu to monitor does not ex‐ " +
		"ist;  read_format  is out of range; sample_type is out of range; " +
		"the flags value is out of range; exclusive or pinned set and the " +
		"event  is not a group leader; the event config values are out of " +
		"range or set reserved bits; the generic event  selected  is  not " +
		"supported;  or  there  is  not  enough  room to add the selected " +
		"event.",

	unix.EINTR: "Returned when trying to mix perf and ftrace handling for  a  uprobe.",

	unix.EMFILE: "Each  opened  event uses one file descriptor.  If a large number " +
		"of events are opened, the per-process limit  on  the  number  of " +
		"open file descriptors will be reached, and no more events can be " +
		"created.",

	unix.ENODEV: "Returned when the event involves a feature not supported by the current CPU.",

	unix.ENOENT: "Returned  if  the type setting is not valid. " +
		"This error is also returned for some unsupported generic events.",

	unix.ENOSPC: "Prior to Linux 3.3, if there was not enough room for the  event, " +
		"ENOSPC  was returned.  In Linux 3.3, this was changed to EINVAL. " +
		"ENOSPC is still returned if  you  try  to  add  more  breakpoint " +
		"events than supported by the hardware.",

	unix.ENOSYS: "Returned  if PERF_SAMPLE_STACK_USER is set in sample_type and it " +
		"is not supported by hardware.",

	unix.EOPNOTSUPP: "Returned if an event requiring a specific  hardware  feature  is " +
		"requested  but  there is no hardware support.  This includes re‐ " +
		"questing low-skid events if not supported, branch tracing if  it " +
		"is not available, sampling if no PMU interrupt is available, and " +
		"branch stacks for software events.",

	unix.EOVERFLOW: "(since Linux 4.8) " +
		"Returned  if  PERF_SAMPLE_CALLCHAIN  is   requested   and   sam‐ " +
		"ple_max_stack   is   larger   than   the  maximum  specified  in " +
		"/proc/sys/kernel/perf_event_max_stack.",

	// This entry is the only one that embeds a line break between its two paragraphs.
	unix.EPERM: "Returned on many (but not all) architectures when an unsupported " +
		"exclude_hv,  exclude_idle,  exclude_user, or exclude_kernel set‐ " +
		"ting is specified. \n" +
		"It can also happen, as with EACCES, when the requested event re‐ " +
		"quires  CAP_PERFMON  (since  Linux 5.8) or CAP_SYS_ADMIN permis‐ " +
		"sions (or a more permissive perf_event paranoid setting).   This " +
		"includes  setting  a  breakpoint on a kernel address, and (since " +
		"Linux 3.13) setting a kernel function-trace tracepoint.",

	unix.ESRCH: "Returned if attempting to attach to a process that does not exist.",
}