gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/syscalls.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"strconv"
    20  
    21  	"google.golang.org/protobuf/proto"
    22  	"gvisor.dev/gvisor/pkg/abi"
    23  	"gvisor.dev/gvisor/pkg/abi/sentry"
    24  	"gvisor.dev/gvisor/pkg/atomicbitops"
    25  	"gvisor.dev/gvisor/pkg/bits"
    26  	"gvisor.dev/gvisor/pkg/hostarch"
    27  	"gvisor.dev/gvisor/pkg/metric"
    28  	"gvisor.dev/gvisor/pkg/sentry/arch"
    29  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    30  	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    31  	"gvisor.dev/gvisor/pkg/sync"
    32  )
    33  
    34  // outOfRangeSyscallNumber is used to represent a syscall number that is out of the
    35  // range [0, maxSyscallNum] in monitoring.
    36  var outOfRangeSyscallNumber = []*metric.FieldValue{&metric.FieldValue{"-1"}}
    37  
    38  // SyscallSupportLevel is a syscall support levels.
    39  type SyscallSupportLevel int
    40  
    41  // String returns a human readable representation of the support level.
    42  func (l SyscallSupportLevel) String() string {
    43  	switch l {
    44  	case SupportUnimplemented:
    45  		return "Unimplemented"
    46  	case SupportPartial:
    47  		return "Partial Support"
    48  	case SupportFull:
    49  		return "Full Support"
    50  	default:
    51  		return "Undocumented"
    52  	}
    53  }
    54  
    55  const (
    56  	// SupportUndocumented indicates the syscall is not documented yet.
    57  	SupportUndocumented = iota
    58  
    59  	// SupportUnimplemented indicates the syscall is unimplemented.
    60  	SupportUnimplemented
    61  
    62  	// SupportPartial indicates the syscall is partially supported.
    63  	SupportPartial
    64  
    65  	// SupportFull indicates the syscall is fully supported.
    66  	SupportFull
    67  )
    68  
// Syscall includes the syscall implementation and compatibility information.
//
// Values of this type are the entries of SyscallTable.Table, keyed by syscall
// number.
type Syscall struct {
	// Name is the syscall name.
	Name string
	// Fn is the implementation of the syscall.
	Fn SyscallFn
	// SupportLevel is the level of support implemented in gVisor.
	SupportLevel SyscallSupportLevel
	// Note describes the compatibility of the syscall.
	Note string
	// URLs is a set of URLs to any relevant bugs or issues.
	URLs []string
	// PointCallback is an optional callback that converts syscall arguments
	// to a proto that can be used with seccheck.Sink.
	// Callback functions must follow this naming convention:
	//   PointSyscallNameInCamelCase, e.g. PointReadat, PointRtSigaction.
	PointCallback SyscallToProto
}
    87  
// SyscallFn is a syscall implementation.
//
// It receives the calling task, the syscall number and the syscall arguments,
// and returns a result value, a *SyscallControl and an error.
type SyscallFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
    90  
// MissingFn is a syscall to be called when an implementation is missing.
//
// It takes the same arguments as SyscallFn but returns only a result value
// and an error.
type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
    93  
// Possible flags for SyscallFlagsTable.enable.
//
// Each constant occupies its own bit (1 << iota, starting at bit 0), so the
// flags can be combined and tested with bitwise operations.
const (
	// syscallPresent indicates that this is not a missing syscall.
	//
	// This flag is used internally in SyscallFlagsTable.
	syscallPresent = 1 << iota

	// StraceEnableLog enables syscall log tracing.
	StraceEnableLog

	// StraceEnableEvent enables syscall event tracing.
	StraceEnableEvent

	// ExternalBeforeEnable enables the external hook before syscall execution.
	ExternalBeforeEnable

	// ExternalAfterEnable enables the external hook after syscall execution.
	ExternalAfterEnable

	// SecCheckEnter represents a schematized/enter syscall seccheck event.
	SecCheckEnter

	// SecCheckExit represents a schematized/exit syscall seccheck event.
	SecCheckExit

	// SecCheckRawEnter represents a raw/enter syscall seccheck event.
	SecCheckRawEnter

	// SecCheckRawExit represents a raw/exit syscall seccheck event.
	SecCheckRawExit
)
   125  
// StraceEnableBits is the mask covering both strace flags: StraceEnableLog and
// StraceEnableEvent.
const StraceEnableBits = StraceEnableLog | StraceEnableEvent
   128  
// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
// basis.
//
// Each syscall number in [0, sentry.MaxSyscallNum] has its own word of flag
// bits; syscall numbers above that range share the missingEnable word.
type SyscallFlagsTable struct {
	// mu protects writes to the fields below.
	//
	// Atomic loads are always allowed. Atomic stores are allowed only
	// while mu is held.
	mu sync.Mutex

	// enable contains the enable bits for each syscall.
	//
	// Missing syscalls have the same value in enable as missingEnable to
	// avoid an extra branch in Word.
	enable [sentry.MaxSyscallNum + 1]atomicbitops.Uint32

	// missingEnable contains the enable bits for missing syscalls.
	missingEnable atomicbitops.Uint32
}
   147  
   148  // Init initializes the struct, with all syscalls in table set to enable.
   149  //
   150  // max is the largest syscall number in table.
   151  func (e *SyscallFlagsTable) init(table map[uintptr]Syscall) {
   152  	for num := range table {
   153  		enableFlags := uint32(syscallPresent)
   154  		e.enable[num] = atomicbitops.FromUint32(enableFlags)
   155  	}
   156  	seccheck.Global.AddSyscallFlagListener(e)
   157  	e.UpdateSecCheck(&seccheck.Global)
   158  }
   159  
   160  // UpdateSecCheck implements seccheck.SyscallFlagListener.
   161  //
   162  // It is called when per-syscall seccheck event enablement changes.
   163  func (e *SyscallFlagsTable) UpdateSecCheck(state *seccheck.State) {
   164  	e.mu.Lock()
   165  	defer e.mu.Unlock()
   166  	for sysno := uintptr(0); sysno <= sentry.MaxSyscallNum; sysno++ {
   167  		oldFlags := e.enable[sysno].Load()
   168  		if !bits.IsOn32(oldFlags, syscallPresent) {
   169  			continue
   170  		}
   171  		flags := oldFlags
   172  		if state.SyscallEnabled(seccheck.SyscallEnter, sysno) {
   173  			flags |= SecCheckEnter
   174  		} else {
   175  			flags &^= SecCheckEnter
   176  		}
   177  		if state.SyscallEnabled(seccheck.SyscallExit, sysno) {
   178  			flags |= SecCheckExit
   179  		} else {
   180  			flags &^= SecCheckExit
   181  		}
   182  		if state.SyscallEnabled(seccheck.SyscallRawEnter, sysno) {
   183  			flags |= SecCheckRawEnter
   184  		} else {
   185  			flags &^= SecCheckRawEnter
   186  		}
   187  		if state.SyscallEnabled(seccheck.SyscallRawExit, sysno) {
   188  			flags |= SecCheckRawExit
   189  		} else {
   190  			flags &^= SecCheckRawExit
   191  		}
   192  		if flags != oldFlags {
   193  			e.enable[sysno].Store(flags)
   194  		}
   195  	}
   196  }
   197  
   198  // Word returns the enable bitfield for sysno.
   199  func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 {
   200  	if sysno <= sentry.MaxSyscallNum {
   201  		return e.enable[sysno].Load()
   202  	}
   203  	return e.missingEnable.Load()
   204  }
   205  
   206  // Enable sets enable bit `bit` for all syscalls based on s.
   207  //
   208  // Syscalls missing from `s` are disabled.
   209  //
   210  // Syscalls missing from the initial table passed to Init cannot be added as
   211  // individual syscalls. If present in s they will be ignored.
   212  //
   213  // Callers to Word may see either the old or new value while this function
   214  // is executing.
   215  func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) {
   216  	e.mu.Lock()
   217  	defer e.mu.Unlock()
   218  
   219  	missingVal := e.missingEnable.Load()
   220  	if missingEnable {
   221  		missingVal |= bit
   222  	} else {
   223  		missingVal &^= bit
   224  	}
   225  	e.missingEnable.Store(missingVal)
   226  
   227  	for num := range e.enable {
   228  		val := e.enable[num].Load()
   229  		if !bits.IsOn32(val, syscallPresent) {
   230  			// Missing.
   231  			e.enable[num].Store(missingVal)
   232  			continue
   233  		}
   234  
   235  		if s[uintptr(num)] {
   236  			val |= bit
   237  		} else {
   238  			val &^= bit
   239  		}
   240  		e.enable[num].Store(val)
   241  	}
   242  }
   243  
   244  // EnableAll sets enable bit bit for all syscalls, present and missing.
   245  func (e *SyscallFlagsTable) EnableAll(bit uint32) {
   246  	e.mu.Lock()
   247  	defer e.mu.Unlock()
   248  
   249  	missingVal := e.missingEnable.Load()
   250  	missingVal |= bit
   251  	e.missingEnable.Store(missingVal)
   252  
   253  	for num := range e.enable {
   254  		val := e.enable[num].Load()
   255  		if !bits.IsOn32(val, syscallPresent) {
   256  			// Missing.
   257  			e.enable[num].Store(missingVal)
   258  			continue
   259  		}
   260  
   261  		val |= bit
   262  		e.enable[num].Store(val)
   263  	}
   264  }
   265  
// Stracer traces syscall execution.
//
// SyscallEnter and SyscallExit form a pair: whatever SyscallEnter returns is
// handed back as the context argument of SyscallExit.
type Stracer interface {
	// SyscallEnter is called on syscall entry.
	//
	// The returned private data is passed to SyscallExit.
	SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any

	// SyscallExit is called on syscall exit.
	//
	// context is the private data returned by the matching SyscallEnter call.
	SyscallExit(context any, t *Task, sysno, rval uintptr, err error)
}
   276  
// SyscallTable is a lookup table of system calls.
//
// Note that a SyscallTable is not savable directly. Instead, they are saved as
// an OS/Arch pair and lookup happens again on restore.
type SyscallTable struct {
	// OS is the operating system that this syscall table implements.
	OS abi.OS

	// Arch is the architecture that this syscall table targets.
	Arch arch.Arch

	// Version is the OS version that this syscall table implements.
	Version Version

	// AuditNumber is a numeric constant that represents the syscall table. If
	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
	// linux/audit.h.
	AuditNumber uint32

	// Table is the collection of functions, keyed by syscall number.
	Table map[uintptr]Syscall

	// lookup is a fixed-size array that holds the syscalls (indexed by
	// their numbers). It is used for fast lookups and is filled in by Init.
	lookup [sentry.MaxSyscallNum + 1]SyscallFn

	// pointCallbacks is a fixed-size array that holds SyscallToProto callbacks
	// (indexed by syscall numbers). It is used for fast lookups when
	// seccheck.Point is enabled for the syscall. It is filled in by Init.
	pointCallbacks [sentry.MaxSyscallNum + 1]SyscallToProto

	// Emulate is a collection of instruction addresses to emulate. The
	// keys are addresses, and the values are system call numbers.
	Emulate map[hostarch.Addr]uintptr

	// Missing is the function to call in case of a missing system call.
	Missing MissingFn

	// Stracer traces this syscall table.
	Stracer Stracer

	// External is used to handle an external callback.
	External func(*Kernel)

	// ExternalFilterBefore is called before External is called before the syscall is executed.
	// External is not called if it returns false.
	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool

	// ExternalFilterAfter is called before External is called after the syscall is executed.
	// External is not called if it returns false.
	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool

	// FeatureEnable stores the strace and one-shot enable bits.
	FeatureEnable SyscallFlagsTable
}
   332  
   333  // MaxSysno returns the largest system call number.
   334  func (s *SyscallTable) MaxSysno() (max uintptr) {
   335  	for num := range s.Table {
   336  		if num > max {
   337  			max = num
   338  		}
   339  	}
   340  	return max
   341  }
   342  
// allSyscallTables contains all known tables, in the order they were passed
// to RegisterSyscallTable.
var allSyscallTables []*SyscallTable
   345  
var (
	// unimplementedSyscallCounterInit ensures the following fields are only initialized once.
	unimplementedSyscallCounterInit sync.Once

	// unimplementedSyscallNumbers maps syscall numbers to their string representation.
	// It exists so that incrementing unimplementedSyscallCounter does not require allocating memory.
	// Each mapped slice has length 1, as there is only one field for the
	// unimplemented syscall counter metric. Allocating a slice is necessary as it is passed as a
	// variadic argument to the metric library.
	unimplementedSyscallNumbers map[uintptr][]*metric.FieldValue

	// unimplementedSyscallCounter tracks the number of times each unimplemented syscall has been
	// called by the sandboxed application.
	unimplementedSyscallCounter *metric.Uint64Metric
)
   361  
// SyscallTables returns a read-only slice of registered SyscallTables.
//
// The returned slice is the package's own registry, not a copy, so callers
// must not modify it.
func SyscallTables() []*SyscallTable {
	return allSyscallTables
}
   366  
   367  // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination.
   368  func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
   369  	for _, s := range allSyscallTables {
   370  		if s.OS == os && s.Arch == a {
   371  			return s, true
   372  		}
   373  	}
   374  	return nil, false
   375  }
   376  
   377  // RegisterSyscallTable registers a new syscall table for use by a Kernel.
   378  func RegisterSyscallTable(s *SyscallTable) {
   379  	if max := s.MaxSysno(); max > sentry.MaxSyscallNum {
   380  		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
   381  	}
   382  	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
   383  		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
   384  	}
   385  	allSyscallTables = append(allSyscallTables, s)
   386  	unimplementedSyscallCounterInit.Do(func() {
   387  		allowedValues := make([]*metric.FieldValue, sentry.MaxSyscallNum+2)
   388  		unimplementedSyscallNumbers = make(map[uintptr][]*metric.FieldValue, len(allowedValues))
   389  		for i := uintptr(0); i <= sentry.MaxSyscallNum; i++ {
   390  			s := &metric.FieldValue{strconv.Itoa(int(i))}
   391  			allowedValues[i] = s
   392  			unimplementedSyscallNumbers[i] = []*metric.FieldValue{s}
   393  		}
   394  		allowedValues[len(allowedValues)-1] = outOfRangeSyscallNumber[0]
   395  		unimplementedSyscallCounter = metric.MustCreateNewUint64Metric("/unimplemented_syscalls", true, "Number of times the application tried to call an unimplemented syscall, broken down by syscall number", metric.NewField("sysno", allowedValues...))
   396  	})
   397  	s.Init()
   398  }
   399  
   400  // Init initializes the system call table.
   401  //
   402  // This should normally be called only during registration.
   403  func (s *SyscallTable) Init() {
   404  	if s.Table == nil {
   405  		// Ensure non-nil lookup table.
   406  		s.Table = make(map[uintptr]Syscall)
   407  	}
   408  	if s.Emulate == nil {
   409  		// Ensure non-nil emulate table.
   410  		s.Emulate = make(map[hostarch.Addr]uintptr)
   411  	}
   412  
   413  	// Initialize the fast-lookup tables.
   414  	for num, sc := range s.Table {
   415  		s.lookup[num] = sc.Fn
   416  	}
   417  	for num, sc := range s.Table {
   418  		s.pointCallbacks[num] = sc.PointCallback
   419  	}
   420  
   421  	// Initialize all features.
   422  	s.FeatureEnable.init(s.Table)
   423  }
   424  
   425  // Lookup returns the syscall implementation, if one exists.
   426  func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
   427  	if sysno <= sentry.MaxSyscallNum {
   428  		return s.lookup[sysno]
   429  	}
   430  	return nil
   431  }
   432  
   433  // LookupName looks up a syscall name.
   434  func (s *SyscallTable) LookupName(sysno uintptr) string {
   435  	if sc, ok := s.Table[sysno]; ok {
   436  		return sc.Name
   437  	}
   438  	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
   439  }
   440  
   441  // LookupNo looks up a syscall number by name.
   442  func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
   443  	for i, syscall := range s.Table {
   444  		if syscall.Name == name {
   445  			return uintptr(i), nil
   446  		}
   447  	}
   448  	return 0, fmt.Errorf("syscall %q not found", name)
   449  }
   450  
   451  // LookupEmulate looks up an emulation syscall number.
   452  func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) {
   453  	sysno, ok := s.Emulate[addr]
   454  	return sysno, ok
   455  }
   456  
   457  // mapLookup is similar to Lookup, except that it only uses the syscall table,
   458  // that is, it skips the fast look array. This is available for benchmarking.
   459  func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
   460  	if sc, ok := s.Table[sysno]; ok {
   461  		return sc.Fn
   462  	}
   463  	return nil
   464  }
   465  
   466  // LookupSyscallToProto looks up the SyscallToProto callback for the given
   467  // syscall. It may return nil if none is registered.
   468  func (s *SyscallTable) LookupSyscallToProto(sysno uintptr) SyscallToProto {
   469  	if sysno > sentry.MaxSyscallNum {
   470  		return nil
   471  	}
   472  	return s.pointCallbacks[sysno]
   473  }
   474  
// SyscallToProto is a callback function that converts generic syscall data to
// schematized protobuf for the corresponding syscall.
//
// It returns the proto message together with its pb.MessageType.
type SyscallToProto func(*Task, seccheck.FieldSet, *pb.ContextData, SyscallInfo) (proto.Message, pb.MessageType)
   478  
// SyscallInfo provides generic information about the syscall.
//
// It is the last argument passed to SyscallToProto callbacks.
type SyscallInfo struct {
	// Exit presumably distinguishes syscall-exit events from syscall-enter
	// events. TODO: confirm against the code that fills this struct.
	Exit bool
	// Sysno is the syscall number.
	Sysno uintptr
	// Args holds the syscall arguments.
	Args arch.SyscallArguments
	// Rval is presumably the syscall return value, meaningful only for exit
	// events. TODO: confirm.
	Rval uintptr
	// Errno is presumably the error number of the syscall result, meaningful
	// only for exit events. TODO: confirm.
	Errno int
}
   487  
// IncrementUnimplementedSyscallCounter increments the "unimplemented syscall" metric for the given
// syscall number.
//
// Syscall numbers that have no precomputed field value (those above
// sentry.MaxSyscallNum) are counted under the out-of-range value "-1".
//
// A syscall table must have been registered with RegisterSyscallTable prior to calling this
// function, since that is where the counter and the field values are created.
// +checkescape:all
//
//go:nosplit
func IncrementUnimplementedSyscallCounter(sysno uintptr) {
	// Use the precomputed one-element slice so that no allocation is needed here.
	s, found := unimplementedSyscallNumbers[sysno]
	if !found {
		s = outOfRangeSyscallNumber
	}
	unimplementedSyscallCounter.Increment(s...)
}