github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/syscalls.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"strconv"
    20  
    21  	"github.com/MerlinKodo/gvisor/pkg/abi"
    22  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    23  	"github.com/MerlinKodo/gvisor/pkg/bits"
    24  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    25  	"github.com/MerlinKodo/gvisor/pkg/metric"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    28  	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    29  	"github.com/MerlinKodo/gvisor/pkg/sync"
    30  	"google.golang.org/protobuf/proto"
    31  )
    32  
const (
	// maxSyscallNum is the highest supported syscall number.
	//
	// The types below keep fixed-size arrays of maxSyscallNum+1 entries that
	// are indexed directly by syscall number. This maximum keeps those arrays
	// from becoming huge because of one very large syscall number.
	// RegisterSyscallTable panics if a table contains a larger number.
	// LINT.IfChange
	maxSyscallNum = 2000
	// LINT.ThenChange(../seccheck/syscall.go)
)
    43  
    44  // outOfRangeSyscallNumber is used to represent a syscall number that is out of the
    45  // range [0, maxSyscallNum] in monitoring.
    46  var outOfRangeSyscallNumber = []*metric.FieldValue{&metric.FieldValue{"-1"}}
    47  
    48  // SyscallSupportLevel is a syscall support levels.
    49  type SyscallSupportLevel int
    50  
    51  // String returns a human readable represetation of the support level.
    52  func (l SyscallSupportLevel) String() string {
    53  	switch l {
    54  	case SupportUnimplemented:
    55  		return "Unimplemented"
    56  	case SupportPartial:
    57  		return "Partial Support"
    58  	case SupportFull:
    59  		return "Full Support"
    60  	default:
    61  		return "Undocumented"
    62  	}
    63  }
    64  
    65  const (
    66  	// SupportUndocumented indicates the syscall is not documented yet.
    67  	SupportUndocumented = iota
    68  
    69  	// SupportUnimplemented indicates the syscall is unimplemented.
    70  	SupportUnimplemented
    71  
    72  	// SupportPartial indicates the syscall is partially supported.
    73  	SupportPartial
    74  
    75  	// SupportFull indicates the syscall is fully supported.
    76  	SupportFull
    77  )
    78  
// Syscall includes the syscall implementation and compatibility information.
//
// Syscall values are the entries of SyscallTable.Table, keyed by syscall
// number.
type Syscall struct {
	// Name is the syscall name.
	Name string
	// Fn is the implementation of the syscall.
	Fn SyscallFn
	// SupportLevel is the level of support implemented in gVisor.
	SupportLevel SyscallSupportLevel
	// Note describes the compatibility of the syscall.
	Note string
	// URLs is a set of URLs to any relevant bugs or issues.
	URLs []string
	// PointCallback is an optional callback that converts syscall arguments
	// to a proto that can be used with seccheck.Sink.
	// Callback functions must follow this naming convention:
	//   PointSyscallNameInCamelCase, e.g. PointReadat, PointRtSigaction.
	PointCallback SyscallToProto
}
    97  
// SyscallFn is a syscall implementation.
//
// It receives the calling task, the syscall number and the syscall arguments,
// and returns a uintptr result, an optional *SyscallControl and an error.
type SyscallFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
   100  
// MissingFn is a syscall to be called when an implementation is missing.
//
// Unlike SyscallFn, it does not return a *SyscallControl.
type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
   103  
// Possible flags for SyscallFlagsTable.enable.
//
// Each constant is a distinct single bit (1 << iota), so they can be combined
// with bitwise OR and tested independently.
const (
	// syscallPresent indicates that this is not a missing syscall.
	//
	// This flag is used internally in SyscallFlagsTable.
	syscallPresent = 1 << iota

	// StraceEnableLog enables syscall log tracing.
	StraceEnableLog

	// StraceEnableEvent enables syscall event tracing.
	StraceEnableEvent

	// ExternalBeforeEnable enables the external hook before syscall execution.
	ExternalBeforeEnable

	// ExternalAfterEnable enables the external hook after syscall execution.
	ExternalAfterEnable

	// SecCheckEnter represents a schematized/enter syscall seccheck event.
	SecCheckEnter

	// SecCheckExit represents a schematized/exit syscall seccheck event.
	SecCheckExit

	// SecCheckRawEnter represents a raw/enter syscall seccheck event.
	SecCheckRawEnter

	// SecCheckRawExit represents a raw/exit syscall seccheck event.
	SecCheckRawExit
)
   135  
// StraceEnableBits is the mask covering both strace flags: StraceEnableLog and
// StraceEnableEvent.
const StraceEnableBits = StraceEnableLog | StraceEnableEvent
   138  
// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
// basis.
//
// It contains a mutex and atomic values, so it must be used through a pointer
// and never copied.
type SyscallFlagsTable struct {
	// mu protects writes to the fields below.
	//
	// Atomic loads are always allowed. Atomic stores are allowed only
	// while mu is held.
	mu sync.Mutex

	// enable contains the enable bits for each syscall, indexed by syscall
	// number in the range [0, maxSyscallNum].
	//
	// missing syscalls have the same value in enable as missingEnable to
	// avoid an extra branch in Word.
	enable [maxSyscallNum + 1]atomicbitops.Uint32

	// missingEnable contains the enable bits for missing syscalls. Word
	// returns it for syscall numbers above maxSyscallNum.
	missingEnable atomicbitops.Uint32
}
   157  
   158  // Init initializes the struct, with all syscalls in table set to enable.
   159  //
   160  // max is the largest syscall number in table.
   161  func (e *SyscallFlagsTable) init(table map[uintptr]Syscall) {
   162  	for num := range table {
   163  		enableFlags := uint32(syscallPresent)
   164  		e.enable[num] = atomicbitops.FromUint32(enableFlags)
   165  	}
   166  	seccheck.Global.AddSyscallFlagListener(e)
   167  	e.UpdateSecCheck(&seccheck.Global)
   168  }
   169  
   170  // UpdateSecCheck implements seccheck.SyscallFlagListener.
   171  //
   172  // It is called when per-syscall seccheck event enablement changes.
   173  func (e *SyscallFlagsTable) UpdateSecCheck(state *seccheck.State) {
   174  	e.mu.Lock()
   175  	defer e.mu.Unlock()
   176  	for sysno := uintptr(0); sysno < maxSyscallNum; sysno++ {
   177  		oldFlags := e.enable[sysno].Load()
   178  		if !bits.IsOn32(oldFlags, syscallPresent) {
   179  			continue
   180  		}
   181  		flags := oldFlags
   182  		if state.SyscallEnabled(seccheck.SyscallEnter, sysno) {
   183  			flags |= SecCheckEnter
   184  		} else {
   185  			flags &^= SecCheckEnter
   186  		}
   187  		if state.SyscallEnabled(seccheck.SyscallExit, sysno) {
   188  			flags |= SecCheckExit
   189  		} else {
   190  			flags &^= SecCheckExit
   191  		}
   192  		if state.SyscallEnabled(seccheck.SyscallRawEnter, sysno) {
   193  			flags |= SecCheckRawEnter
   194  		} else {
   195  			flags &^= SecCheckRawEnter
   196  		}
   197  		if state.SyscallEnabled(seccheck.SyscallRawExit, sysno) {
   198  			flags |= SecCheckRawExit
   199  		} else {
   200  			flags &^= SecCheckRawExit
   201  		}
   202  		if flags != oldFlags {
   203  			e.enable[sysno].Store(flags)
   204  		}
   205  	}
   206  }
   207  
   208  // Word returns the enable bitfield for sysno.
   209  func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 {
   210  	if sysno <= maxSyscallNum {
   211  		return e.enable[sysno].Load()
   212  	}
   213  	return e.missingEnable.Load()
   214  }
   215  
   216  // Enable sets enable bit `bit` for all syscalls based on s.
   217  //
   218  // Syscalls missing from `s` are disabled.
   219  //
   220  // Syscalls missing from the initial table passed to Init cannot be added as
   221  // individual syscalls. If present in s they will be ignored.
   222  //
   223  // Callers to Word may see either the old or new value while this function
   224  // is executing.
   225  func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) {
   226  	e.mu.Lock()
   227  	defer e.mu.Unlock()
   228  
   229  	missingVal := e.missingEnable.Load()
   230  	if missingEnable {
   231  		missingVal |= bit
   232  	} else {
   233  		missingVal &^= bit
   234  	}
   235  	e.missingEnable.Store(missingVal)
   236  
   237  	for num := range e.enable {
   238  		val := e.enable[num].Load()
   239  		if !bits.IsOn32(val, syscallPresent) {
   240  			// Missing.
   241  			e.enable[num].Store(missingVal)
   242  			continue
   243  		}
   244  
   245  		if s[uintptr(num)] {
   246  			val |= bit
   247  		} else {
   248  			val &^= bit
   249  		}
   250  		e.enable[num].Store(val)
   251  	}
   252  }
   253  
   254  // EnableAll sets enable bit bit for all syscalls, present and missing.
   255  func (e *SyscallFlagsTable) EnableAll(bit uint32) {
   256  	e.mu.Lock()
   257  	defer e.mu.Unlock()
   258  
   259  	missingVal := e.missingEnable.Load()
   260  	missingVal |= bit
   261  	e.missingEnable.Store(missingVal)
   262  
   263  	for num := range e.enable {
   264  		val := e.enable[num].Load()
   265  		if !bits.IsOn32(val, syscallPresent) {
   266  			// Missing.
   267  			e.enable[num].Store(missingVal)
   268  			continue
   269  		}
   270  
   271  		val |= bit
   272  		e.enable[num].Store(val)
   273  	}
   274  }
   275  
// Stracer traces syscall execution.
type Stracer interface {
	// SyscallEnter is called on syscall entry.
	//
	// flags is a uint32 bit mask; presumably it carries the strace enable
	// bits for the syscall (verify against the caller).
	//
	// The returned private data is passed to SyscallExit.
	SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any

	// SyscallExit is called on syscall exit.
	//
	// context is the private data returned by SyscallEnter.
	SyscallExit(context any, t *Task, sysno, rval uintptr, err error)
}
   286  
// SyscallTable is a lookup table of system calls.
//
// Note that a SyscallTable is not savable directly. Instead, they are saved as
// an OS/Arch pair and lookup happens again on restore.
type SyscallTable struct {
	// OS is the operating system that this syscall table implements.
	OS abi.OS

	// Arch is the architecture that this syscall table targets.
	Arch arch.Arch

	// Version is the OS version that this syscall table implements.
	Version Version

	// AuditNumber is a numeric constant that represents the syscall table. If
	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
	// linux/audit.h.
	AuditNumber uint32

	// Table is the collection of functions, keyed by syscall number.
	Table map[uintptr]Syscall

	// lookup is a fixed-size array that holds the syscalls (indexed by
	// their numbers). It is used for fast look ups. It is filled from Table
	// by Init.
	lookup [maxSyscallNum + 1]SyscallFn

	// pointCallbacks is a fixed-size array that holds SyscallToProto callbacks
	// (indexed by syscall numbers). It is used for fast lookups when
	// seccheck.Point is enabled for the syscall. It is filled from Table by
	// Init.
	pointCallbacks [maxSyscallNum + 1]SyscallToProto

	// Emulate is a collection of instruction addresses to emulate. The
	// keys are addresses, and the values are system call numbers.
	Emulate map[hostarch.Addr]uintptr

	// Missing is the function to call in case of a missing system call.
	Missing MissingFn

	// Stracer traces this syscall table.
	Stracer Stracer

	// External is used to handle an external callback.
	External func(*Kernel)

	// ExternalFilterBefore is consulted before External is invoked ahead of
	// syscall execution. External is not called if it returns false.
	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool

	// ExternalFilterAfter is consulted before External is invoked following
	// syscall execution. External is not called if it returns false.
	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool

	// FeatureEnable stores the strace and one-shot enable bits.
	FeatureEnable SyscallFlagsTable
}
   342  
   343  // MaxSysno returns the largest system call number.
   344  func (s *SyscallTable) MaxSysno() (max uintptr) {
   345  	for num := range s.Table {
   346  		if num > max {
   347  			max = num
   348  		}
   349  	}
   350  	return max
   351  }
   352  
// allSyscallTables contains all known tables. RegisterSyscallTable appends to
// it.
var allSyscallTables []*SyscallTable
   355  
var (
	// unimplementedSyscallCounterInit ensures the following fields are only initialized once.
	unimplementedSyscallCounterInit sync.Once

	// unimplementedSyscallNumbers maps syscall numbers to their string representation.
	// It exists so that incrementing unimplementedSyscallCounter does not require allocating
	// memory. Each mapped slice has length 1, as there is only one field for the unimplemented
	// syscall counter metric. A slice is needed because it is passed as a variadic argument to
	// the metric library.
	unimplementedSyscallNumbers map[uintptr][]*metric.FieldValue

	// unimplementedSyscallCounter tracks the number of times each unimplemented syscall has been
	// called by the sandboxed application.
	unimplementedSyscallCounter *metric.Uint64Metric
)
   371  
// SyscallTables returns a read-only slice of registered SyscallTables.
//
// The returned slice is the package's own slice, not a copy, so callers must
// not modify it.
func SyscallTables() []*SyscallTable {
	return allSyscallTables
}
   376  
   377  // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination.
   378  func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
   379  	for _, s := range allSyscallTables {
   380  		if s.OS == os && s.Arch == a {
   381  			return s, true
   382  		}
   383  	}
   384  	return nil, false
   385  }
   386  
   387  // RegisterSyscallTable registers a new syscall table for use by a Kernel.
   388  func RegisterSyscallTable(s *SyscallTable) {
   389  	if max := s.MaxSysno(); max > maxSyscallNum {
   390  		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
   391  	}
   392  	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
   393  		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
   394  	}
   395  	allSyscallTables = append(allSyscallTables, s)
   396  	unimplementedSyscallCounterInit.Do(func() {
   397  		allowedValues := make([]*metric.FieldValue, maxSyscallNum+2)
   398  		unimplementedSyscallNumbers = make(map[uintptr][]*metric.FieldValue, len(allowedValues))
   399  		for i := uintptr(0); i <= maxSyscallNum; i++ {
   400  			s := &metric.FieldValue{strconv.Itoa(int(i))}
   401  			allowedValues[i] = s
   402  			unimplementedSyscallNumbers[i] = []*metric.FieldValue{s}
   403  		}
   404  		allowedValues[len(allowedValues)-1] = outOfRangeSyscallNumber[0]
   405  		unimplementedSyscallCounter = metric.MustCreateNewUint64Metric("/unimplemented_syscalls", true, "Number of times the application tried to call an unimplemented syscall, broken down by syscall number", metric.NewField("sysno", allowedValues...))
   406  	})
   407  	s.Init()
   408  }
   409  
   410  // Init initializes the system call table.
   411  //
   412  // This should normally be called only during registration.
   413  func (s *SyscallTable) Init() {
   414  	if s.Table == nil {
   415  		// Ensure non-nil lookup table.
   416  		s.Table = make(map[uintptr]Syscall)
   417  	}
   418  	if s.Emulate == nil {
   419  		// Ensure non-nil emulate table.
   420  		s.Emulate = make(map[hostarch.Addr]uintptr)
   421  	}
   422  
   423  	// Initialize the fast-lookup tables.
   424  	for num, sc := range s.Table {
   425  		s.lookup[num] = sc.Fn
   426  	}
   427  	for num, sc := range s.Table {
   428  		s.pointCallbacks[num] = sc.PointCallback
   429  	}
   430  
   431  	// Initialize all features.
   432  	s.FeatureEnable.init(s.Table)
   433  }
   434  
   435  // Lookup returns the syscall implementation, if one exists.
   436  func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
   437  	if sysno <= maxSyscallNum {
   438  		return s.lookup[sysno]
   439  	}
   440  	return nil
   441  }
   442  
   443  // LookupName looks up a syscall name.
   444  func (s *SyscallTable) LookupName(sysno uintptr) string {
   445  	if sc, ok := s.Table[sysno]; ok {
   446  		return sc.Name
   447  	}
   448  	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
   449  }
   450  
   451  // LookupNo looks up a syscall number by name.
   452  func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
   453  	for i, syscall := range s.Table {
   454  		if syscall.Name == name {
   455  			return uintptr(i), nil
   456  		}
   457  	}
   458  	return 0, fmt.Errorf("syscall %q not found", name)
   459  }
   460  
   461  // LookupEmulate looks up an emulation syscall number.
   462  func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) {
   463  	sysno, ok := s.Emulate[addr]
   464  	return sysno, ok
   465  }
   466  
   467  // mapLookup is similar to Lookup, except that it only uses the syscall table,
   468  // that is, it skips the fast look array. This is available for benchmarking.
   469  func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
   470  	if sc, ok := s.Table[sysno]; ok {
   471  		return sc.Fn
   472  	}
   473  	return nil
   474  }
   475  
   476  // LookupSyscallToProto looks up the SyscallToProto callback for the given
   477  // syscall. It may return nil if none is registered.
   478  func (s *SyscallTable) LookupSyscallToProto(sysno uintptr) SyscallToProto {
   479  	if sysno > maxSyscallNum {
   480  		return nil
   481  	}
   482  	return s.pointCallbacks[sysno]
   483  }
   484  
// SyscallToProto is a callback function that converts generic syscall data to
// schematized protobuf for the corresponding syscall.
//
// It returns the resulting message together with its pb.MessageType.
type SyscallToProto func(*Task, seccheck.FieldSet, *pb.ContextData, SyscallInfo) (proto.Message, pb.MessageType)
   488  
// SyscallInfo provides generic information about the syscall.
//
// It is the last argument passed to SyscallToProto callbacks.
type SyscallInfo struct {
	// NOTE(review): presumably true for syscall-exit events and false for
	// syscall-enter events; confirm against the code that fills this in.
	Exit bool
	// Sysno is the syscall number.
	Sysno uintptr
	// Args holds the syscall arguments.
	Args arch.SyscallArguments
	// NOTE(review): Rval and Errno look like the syscall result and error
	// number, presumably meaningful only when Exit is set; confirm.
	Rval  uintptr
	Errno int
}
   497  
   498  // IncrementUnimplementedSyscallCounter increments the "unimplemented syscall" metric for the given
   499  // syscall number.
   500  // A syscall table must have been initialized prior to calling this function.
   501  // +checkescape:all
   502  //
   503  //go:nosplit
   504  func IncrementUnimplementedSyscallCounter(sysno uintptr) {
   505  	s, found := unimplementedSyscallNumbers[sysno]
   506  	if !found {
   507  		s = outOfRangeSyscallNumber
   508  	}
   509  	unimplementedSyscallCounter.Increment(s...)
   510  }