github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/syscalls.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 "sync/atomic" 20 21 "github.com/SagerNet/gvisor/pkg/abi" 22 "github.com/SagerNet/gvisor/pkg/bits" 23 "github.com/SagerNet/gvisor/pkg/hostarch" 24 "github.com/SagerNet/gvisor/pkg/sentry/arch" 25 "github.com/SagerNet/gvisor/pkg/sync" 26 ) 27 28 // maxSyscallNum is the highest supported syscall number. 29 // 30 // The types below create fast lookup slices for all syscalls. This maximum 31 // serves as a sanity check that we don't allocate huge slices for a very large 32 // syscall. This is checked during registration. 33 const maxSyscallNum = 2000 34 35 // SyscallSupportLevel is a syscall support levels. 36 type SyscallSupportLevel int 37 38 // String returns a human readable represetation of the support level. 39 func (l SyscallSupportLevel) String() string { 40 switch l { 41 case SupportUnimplemented: 42 return "Unimplemented" 43 case SupportPartial: 44 return "Partial Support" 45 case SupportFull: 46 return "Full Support" 47 default: 48 return "Undocumented" 49 } 50 } 51 52 const ( 53 // SupportUndocumented indicates the syscall is not documented yet. 54 SupportUndocumented = iota 55 56 // SupportUnimplemented indicates the syscall is unimplemented. 57 SupportUnimplemented 58 59 // SupportPartial indicates the syscall is partially supported. 60 SupportPartial 61 62 // SupportFull indicates the syscall is fully supported. 63 SupportFull 64 ) 65 66 // Syscall includes the syscall implementation and compatibility information. 67 type Syscall struct { 68 // Name is the syscall name. 69 Name string 70 // Fn is the implementation of the syscall. 71 Fn SyscallFn 72 // SupportLevel is the level of support implemented in gVisor. 73 SupportLevel SyscallSupportLevel 74 // Note describes the compatibility of the syscall. 75 Note string 76 // URLs is set of URLs to any relevant bugs or issues. 77 URLs []string 78 } 79 80 // SyscallFn is a syscall implementation. 81 type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) 82 83 // MissingFn is a syscall to be called when an implementation is missing. 84 type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) 85 86 // Possible flags for SyscallFlagsTable.enable. 87 const ( 88 // syscallPresent indicates that this is not a missing syscall. 89 // 90 // This flag is used internally in SyscallFlagsTable. 91 syscallPresent = 1 << iota 92 93 // StraceEnableLog enables syscall log tracing. 94 StraceEnableLog 95 96 // StraceEnableEvent enables syscall event tracing. 97 StraceEnableEvent 98 99 // ExternalBeforeEnable enables the external hook before syscall execution. 100 ExternalBeforeEnable 101 102 // ExternalAfterEnable enables the external hook after syscall execution. 103 ExternalAfterEnable 104 ) 105 106 // StraceEnableBits combines both strace log and event flags. 107 const StraceEnableBits = StraceEnableLog | StraceEnableEvent 108 109 // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall 110 // basis. 111 type SyscallFlagsTable struct { 112 // mu protects writes to the fields below. 113 // 114 // Atomic loads are always allowed. Atomic stores are allowed only 115 // while mu is held. 116 mu sync.Mutex 117 118 // enable contains the enable bits for each syscall. 119 // 120 // missing syscalls have the same value in enable as missingEnable to 121 // avoid an extra branch in Word. 122 enable []uint32 123 124 // missingEnable contains the enable bits for missing syscalls. 125 missingEnable uint32 126 } 127 128 // Init initializes the struct, with all syscalls in table set to enable. 129 // 130 // max is the largest syscall number in table. 131 func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { 132 e.enable = make([]uint32, max+1) 133 for num := range table { 134 e.enable[num] = syscallPresent 135 } 136 } 137 138 // Word returns the enable bitfield for sysno. 139 func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { 140 if sysno < uintptr(len(e.enable)) { 141 return atomic.LoadUint32(&e.enable[sysno]) 142 } 143 144 return atomic.LoadUint32(&e.missingEnable) 145 } 146 147 // Enable sets enable bit bit for all syscalls based on s. 148 // 149 // Syscalls missing from s are disabled. 150 // 151 // Syscalls missing from the initial table passed to Init cannot be added as 152 // individual syscalls. If present in s they will be ignored. 153 // 154 // Callers to Word may see either the old or new value while this function 155 // is executing. 156 func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { 157 e.mu.Lock() 158 defer e.mu.Unlock() 159 160 missingVal := atomic.LoadUint32(&e.missingEnable) 161 if missingEnable { 162 missingVal |= bit 163 } else { 164 missingVal &^= bit 165 } 166 atomic.StoreUint32(&e.missingEnable, missingVal) 167 168 for num := range e.enable { 169 val := atomic.LoadUint32(&e.enable[num]) 170 if !bits.IsOn32(val, syscallPresent) { 171 // Missing. 172 atomic.StoreUint32(&e.enable[num], missingVal) 173 continue 174 } 175 176 if s[uintptr(num)] { 177 val |= bit 178 } else { 179 val &^= bit 180 } 181 atomic.StoreUint32(&e.enable[num], val) 182 } 183 } 184 185 // EnableAll sets enable bit bit for all syscalls, present and missing. 186 func (e *SyscallFlagsTable) EnableAll(bit uint32) { 187 e.mu.Lock() 188 defer e.mu.Unlock() 189 190 missingVal := atomic.LoadUint32(&e.missingEnable) 191 missingVal |= bit 192 atomic.StoreUint32(&e.missingEnable, missingVal) 193 194 for num := range e.enable { 195 val := atomic.LoadUint32(&e.enable[num]) 196 if !bits.IsOn32(val, syscallPresent) { 197 // Missing. 198 atomic.StoreUint32(&e.enable[num], missingVal) 199 continue 200 } 201 202 val |= bit 203 atomic.StoreUint32(&e.enable[num], val) 204 } 205 } 206 207 // Stracer traces syscall execution. 208 type Stracer interface { 209 // SyscallEnter is called on syscall entry. 210 // 211 // The returned private data is passed to SyscallExit. 212 SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} 213 214 // SyscallExit is called on syscall exit. 215 SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) 216 } 217 218 // SyscallTable is a lookup table of system calls. 219 // 220 // Note that a SyscallTable is not savable directly. Instead, they are saved as 221 // an OS/Arch pair and lookup happens again on restore. 222 type SyscallTable struct { 223 // OS is the operating system that this syscall table implements. 224 OS abi.OS 225 226 // Arch is the architecture that this syscall table targets. 227 Arch arch.Arch 228 229 // The OS version that this syscall table implements. 230 Version Version 231 232 // AuditNumber is a numeric constant that represents the syscall table. If 233 // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by 234 // linux/audit.h. 235 AuditNumber uint32 236 237 // Table is the collection of functions. 238 Table map[uintptr]Syscall 239 240 // lookup is a fixed-size array that holds the syscalls (indexed by 241 // their numbers). It is used for fast look ups. 242 lookup []SyscallFn 243 244 // Emulate is a collection of instruction addresses to emulate. The 245 // keys are addresses, and the values are system call numbers. 246 Emulate map[hostarch.Addr]uintptr 247 248 // The function to call in case of a missing system call. 249 Missing MissingFn 250 251 // Stracer traces this syscall table. 252 Stracer Stracer 253 254 // External is used to handle an external callback. 255 External func(*Kernel) 256 257 // ExternalFilterBefore is called before External is called before the syscall is executed. 258 // External is not called if it returns false. 259 ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool 260 261 // ExternalFilterAfter is called before External is called after the syscall is executed. 262 // External is not called if it returns false. 263 ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool 264 265 // FeatureEnable stores the strace and one-shot enable bits. 266 FeatureEnable SyscallFlagsTable 267 } 268 269 // MaxSysno returns the largest system call number. 270 func (s *SyscallTable) MaxSysno() (max uintptr) { 271 for num := range s.Table { 272 if num > max { 273 max = num 274 } 275 } 276 return max 277 } 278 279 // allSyscallTables contains all known tables. 280 var allSyscallTables []*SyscallTable 281 282 // SyscallTables returns a read-only slice of registered SyscallTables. 283 func SyscallTables() []*SyscallTable { 284 return allSyscallTables 285 } 286 287 // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. 288 func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { 289 for _, s := range allSyscallTables { 290 if s.OS == os && s.Arch == a { 291 return s, true 292 } 293 } 294 return nil, false 295 } 296 297 // RegisterSyscallTable registers a new syscall table for use by a Kernel. 298 func RegisterSyscallTable(s *SyscallTable) { 299 if max := s.MaxSysno(); max > maxSyscallNum { 300 panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) 301 } 302 if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { 303 panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) 304 } 305 allSyscallTables = append(allSyscallTables, s) 306 s.Init() 307 } 308 309 // Init initializes the system call table. 310 // 311 // This should normally be called only during registration. 312 func (s *SyscallTable) Init() { 313 if s.Table == nil { 314 // Ensure non-nil lookup table. 315 s.Table = make(map[uintptr]Syscall) 316 } 317 if s.Emulate == nil { 318 // Ensure non-nil emulate table. 319 s.Emulate = make(map[hostarch.Addr]uintptr) 320 } 321 322 max := s.MaxSysno() // Checked during RegisterSyscallTable. 323 324 // Initialize the fast-lookup table. 325 s.lookup = make([]SyscallFn, max+1) 326 for num, sc := range s.Table { 327 s.lookup[num] = sc.Fn 328 } 329 330 // Initialize all features. 331 s.FeatureEnable.init(s.Table, max) 332 } 333 334 // Lookup returns the syscall implementation, if one exists. 335 func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { 336 if sysno < uintptr(len(s.lookup)) { 337 return s.lookup[sysno] 338 } 339 340 return nil 341 } 342 343 // LookupName looks up a syscall name. 344 func (s *SyscallTable) LookupName(sysno uintptr) string { 345 if sc, ok := s.Table[sysno]; ok { 346 return sc.Name 347 } 348 return fmt.Sprintf("sys_%d", sysno) // Unlikely. 349 } 350 351 // LookupNo looks up a syscall number by name. 352 func (s *SyscallTable) LookupNo(name string) (uintptr, error) { 353 for i, syscall := range s.Table { 354 if syscall.Name == name { 355 return uintptr(i), nil 356 } 357 } 358 return 0, fmt.Errorf("syscall %q not found", name) 359 } 360 361 // LookupEmulate looks up an emulation syscall number. 362 func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { 363 sysno, ok := s.Emulate[addr] 364 return sysno, ok 365 } 366 367 // mapLookup is similar to Lookup, except that it only uses the syscall table, 368 // that is, it skips the fast look array. This is available for benchmarking. 369 func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { 370 if sc, ok := s.Table[sysno]; ok { 371 return sc.Fn 372 } 373 return nil 374 }