github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/syscalls.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"sync/atomic"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/abi"
    22  	"github.com/SagerNet/gvisor/pkg/bits"
    23  	"github.com/SagerNet/gvisor/pkg/hostarch"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    25  	"github.com/SagerNet/gvisor/pkg/sync"
    26  )
    27  
// maxSyscallNum is the highest supported syscall number.
//
// The types below build fast-lookup slices that are indexed by syscall
// number. This maximum serves as a sanity check that no huge slice is
// allocated for a very large syscall number. It is checked during
// registration: RegisterSyscallTable panics if the largest syscall number in
// a table exceeds it.
const maxSyscallNum = 2000
    34  
    35  // SyscallSupportLevel is a syscall support levels.
    36  type SyscallSupportLevel int
    37  
    38  // String returns a human readable represetation of the support level.
    39  func (l SyscallSupportLevel) String() string {
    40  	switch l {
    41  	case SupportUnimplemented:
    42  		return "Unimplemented"
    43  	case SupportPartial:
    44  		return "Partial Support"
    45  	case SupportFull:
    46  		return "Full Support"
    47  	default:
    48  		return "Undocumented"
    49  	}
    50  }
    51  
    52  const (
    53  	// SupportUndocumented indicates the syscall is not documented yet.
    54  	SupportUndocumented = iota
    55  
    56  	// SupportUnimplemented indicates the syscall is unimplemented.
    57  	SupportUnimplemented
    58  
    59  	// SupportPartial indicates the syscall is partially supported.
    60  	SupportPartial
    61  
    62  	// SupportFull indicates the syscall is fully supported.
    63  	SupportFull
    64  )
    65  
// Syscall includes the syscall implementation and compatibility information.
//
// It is the value type of SyscallTable.Table, which is keyed by syscall
// number.
type Syscall struct {
	// Name is the syscall name.
	Name string
	// Fn is the implementation of the syscall.
	Fn SyscallFn
	// SupportLevel is the level of support implemented in gVisor.
	SupportLevel SyscallSupportLevel
	// Note describes the compatibility of the syscall.
	Note string
	// URLs is a set of URLs to any relevant bugs or issues.
	URLs []string
}
    79  
// SyscallFn is a syscall implementation.
//
// It takes a Task and the syscall arguments and returns a uintptr, a
// *SyscallControl and an error. How those results are interpreted is decided
// by the code that dispatches the syscall, which is not part of this file.
type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
    82  
// MissingFn is the type of the function to be called when a syscall
// implementation is missing (see SyscallTable.Missing).
//
// Unlike SyscallFn, it also receives the syscall number, sysno.
type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
    85  
// Possible flags for SyscallFlagsTable.enable.
//
// Each flag occupies its own bit (1 << iota), so several flags can be
// combined in a single enable word.
const (
	// syscallPresent indicates that this is not a missing syscall.
	//
	// This flag is used internally in SyscallFlagsTable.
	syscallPresent = 1 << iota

	// StraceEnableLog enables syscall log tracing.
	StraceEnableLog

	// StraceEnableEvent enables syscall event tracing.
	StraceEnableEvent

	// ExternalBeforeEnable enables the external hook before syscall execution.
	ExternalBeforeEnable

	// ExternalAfterEnable enables the external hook after syscall execution.
	ExternalAfterEnable
)
   105  
// StraceEnableBits combines both strace flags: StraceEnableLog and
// StraceEnableEvent.
const StraceEnableBits = StraceEnableLog | StraceEnableEvent
   108  
// SyscallFlagsTable manages a set of enable/disable bit fields on a
// per-syscall basis.
type SyscallFlagsTable struct {
	// mu protects writes to the fields below.
	//
	// Atomic loads are always allowed. Atomic stores are allowed only
	// while mu is held.
	mu sync.Mutex

	// enable contains the enable bits for each syscall, indexed by syscall
	// number.
	//
	// Missing syscalls have the same value in enable as missingEnable to
	// avoid an extra branch in Word.
	enable []uint32

	// missingEnable contains the enable bits for missing syscalls. Word
	// returns it for syscall numbers beyond the end of enable.
	missingEnable uint32
}
   127  
   128  // Init initializes the struct, with all syscalls in table set to enable.
   129  //
   130  // max is the largest syscall number in table.
   131  func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) {
   132  	e.enable = make([]uint32, max+1)
   133  	for num := range table {
   134  		e.enable[num] = syscallPresent
   135  	}
   136  }
   137  
   138  // Word returns the enable bitfield for sysno.
   139  func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 {
   140  	if sysno < uintptr(len(e.enable)) {
   141  		return atomic.LoadUint32(&e.enable[sysno])
   142  	}
   143  
   144  	return atomic.LoadUint32(&e.missingEnable)
   145  }
   146  
   147  // Enable sets enable bit bit for all syscalls based on s.
   148  //
   149  // Syscalls missing from s are disabled.
   150  //
   151  // Syscalls missing from the initial table passed to Init cannot be added as
   152  // individual syscalls. If present in s they will be ignored.
   153  //
   154  // Callers to Word may see either the old or new value while this function
   155  // is executing.
   156  func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) {
   157  	e.mu.Lock()
   158  	defer e.mu.Unlock()
   159  
   160  	missingVal := atomic.LoadUint32(&e.missingEnable)
   161  	if missingEnable {
   162  		missingVal |= bit
   163  	} else {
   164  		missingVal &^= bit
   165  	}
   166  	atomic.StoreUint32(&e.missingEnable, missingVal)
   167  
   168  	for num := range e.enable {
   169  		val := atomic.LoadUint32(&e.enable[num])
   170  		if !bits.IsOn32(val, syscallPresent) {
   171  			// Missing.
   172  			atomic.StoreUint32(&e.enable[num], missingVal)
   173  			continue
   174  		}
   175  
   176  		if s[uintptr(num)] {
   177  			val |= bit
   178  		} else {
   179  			val &^= bit
   180  		}
   181  		atomic.StoreUint32(&e.enable[num], val)
   182  	}
   183  }
   184  
   185  // EnableAll sets enable bit bit for all syscalls, present and missing.
   186  func (e *SyscallFlagsTable) EnableAll(bit uint32) {
   187  	e.mu.Lock()
   188  	defer e.mu.Unlock()
   189  
   190  	missingVal := atomic.LoadUint32(&e.missingEnable)
   191  	missingVal |= bit
   192  	atomic.StoreUint32(&e.missingEnable, missingVal)
   193  
   194  	for num := range e.enable {
   195  		val := atomic.LoadUint32(&e.enable[num])
   196  		if !bits.IsOn32(val, syscallPresent) {
   197  			// Missing.
   198  			atomic.StoreUint32(&e.enable[num], missingVal)
   199  			continue
   200  		}
   201  
   202  		val |= bit
   203  		atomic.StoreUint32(&e.enable[num], val)
   204  	}
   205  }
   206  
// Stracer traces syscall execution.
type Stracer interface {
	// SyscallEnter is called on syscall entry.
	//
	// The returned private data is passed to SyscallExit.
	SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}

	// SyscallExit is called on syscall exit.
	//
	// NOTE(review): context is presumably the private data returned by the
	// matching SyscallEnter call; confirm against the caller.
	SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
}
   217  
// SyscallTable is a lookup table of system calls.
//
// Note that a SyscallTable is not savable directly. Instead, they are saved as
// an OS/Arch pair and lookup happens again on restore.
type SyscallTable struct {
	// OS is the operating system that this syscall table implements.
	OS abi.OS

	// Arch is the architecture that this syscall table targets.
	Arch arch.Arch

	// Version is the OS version that this syscall table implements.
	Version Version

	// AuditNumber is a numeric constant that represents the syscall table. If
	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
	// linux/audit.h.
	AuditNumber uint32

	// Table is the collection of syscalls, keyed by syscall number.
	Table map[uintptr]Syscall

	// lookup is a slice that holds the syscall implementations, indexed by
	// their numbers. It is built from Table by Init and is used for fast
	// lookups.
	lookup []SyscallFn

	// Emulate is a collection of instruction addresses to emulate. The
	// keys are addresses, and the values are system call numbers.
	Emulate map[hostarch.Addr]uintptr

	// Missing is the function to call in case of a missing system call.
	Missing MissingFn

	// Stracer traces this syscall table.
	Stracer Stracer

	// External is used to handle an external callback.
	External func(*Kernel)

	// ExternalFilterBefore is consulted before External is invoked ahead of
	// the syscall's execution. External is not called if it returns false.
	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool

	// ExternalFilterAfter is consulted before External is invoked following
	// the syscall's execution. External is not called if it returns false.
	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool

	// FeatureEnable stores the per-syscall feature enable bits (the
	// Strace* and External* flags).
	FeatureEnable SyscallFlagsTable
}
   268  
   269  // MaxSysno returns the largest system call number.
   270  func (s *SyscallTable) MaxSysno() (max uintptr) {
   271  	for num := range s.Table {
   272  		if num > max {
   273  			max = num
   274  		}
   275  	}
   276  	return max
   277  }
   278  
// allSyscallTables contains all known tables, in the order in which they were
// passed to RegisterSyscallTable.
var allSyscallTables []*SyscallTable
   281  
// SyscallTables returns the slice of registered SyscallTables.
//
// The returned slice is the package's own registry, not a copy, so callers
// must treat it as read-only.
func SyscallTables() []*SyscallTable {
	return allSyscallTables
}
   286  
   287  // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination.
   288  func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
   289  	for _, s := range allSyscallTables {
   290  		if s.OS == os && s.Arch == a {
   291  			return s, true
   292  		}
   293  	}
   294  	return nil, false
   295  }
   296  
   297  // RegisterSyscallTable registers a new syscall table for use by a Kernel.
   298  func RegisterSyscallTable(s *SyscallTable) {
   299  	if max := s.MaxSysno(); max > maxSyscallNum {
   300  		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
   301  	}
   302  	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
   303  		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
   304  	}
   305  	allSyscallTables = append(allSyscallTables, s)
   306  	s.Init()
   307  }
   308  
   309  // Init initializes the system call table.
   310  //
   311  // This should normally be called only during registration.
   312  func (s *SyscallTable) Init() {
   313  	if s.Table == nil {
   314  		// Ensure non-nil lookup table.
   315  		s.Table = make(map[uintptr]Syscall)
   316  	}
   317  	if s.Emulate == nil {
   318  		// Ensure non-nil emulate table.
   319  		s.Emulate = make(map[hostarch.Addr]uintptr)
   320  	}
   321  
   322  	max := s.MaxSysno() // Checked during RegisterSyscallTable.
   323  
   324  	// Initialize the fast-lookup table.
   325  	s.lookup = make([]SyscallFn, max+1)
   326  	for num, sc := range s.Table {
   327  		s.lookup[num] = sc.Fn
   328  	}
   329  
   330  	// Initialize all features.
   331  	s.FeatureEnable.init(s.Table, max)
   332  }
   333  
   334  // Lookup returns the syscall implementation, if one exists.
   335  func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
   336  	if sysno < uintptr(len(s.lookup)) {
   337  		return s.lookup[sysno]
   338  	}
   339  
   340  	return nil
   341  }
   342  
   343  // LookupName looks up a syscall name.
   344  func (s *SyscallTable) LookupName(sysno uintptr) string {
   345  	if sc, ok := s.Table[sysno]; ok {
   346  		return sc.Name
   347  	}
   348  	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
   349  }
   350  
   351  // LookupNo looks up a syscall number by name.
   352  func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
   353  	for i, syscall := range s.Table {
   354  		if syscall.Name == name {
   355  			return uintptr(i), nil
   356  		}
   357  	}
   358  	return 0, fmt.Errorf("syscall %q not found", name)
   359  }
   360  
   361  // LookupEmulate looks up an emulation syscall number.
   362  func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) {
   363  	sysno, ok := s.Emulate[addr]
   364  	return sysno, ok
   365  }
   366  
   367  // mapLookup is similar to Lookup, except that it only uses the syscall table,
   368  // that is, it skips the fast look array. This is available for benchmarking.
   369  func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
   370  	if sc, ok := s.Table[sysno]; ok {
   371  		return sc.Fn
   372  	}
   373  	return nil
   374  }