github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/syscalls.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 "strconv" 20 21 "google.golang.org/protobuf/proto" 22 "github.com/metacubex/gvisor/pkg/abi" 23 "github.com/metacubex/gvisor/pkg/abi/sentry" 24 "github.com/metacubex/gvisor/pkg/atomicbitops" 25 "github.com/metacubex/gvisor/pkg/bits" 26 "github.com/metacubex/gvisor/pkg/hostarch" 27 "github.com/metacubex/gvisor/pkg/metric" 28 "github.com/metacubex/gvisor/pkg/sentry/arch" 29 "github.com/metacubex/gvisor/pkg/sentry/seccheck" 30 pb "github.com/metacubex/gvisor/pkg/sentry/seccheck/points/points_go_proto" 31 "github.com/metacubex/gvisor/pkg/sync" 32 ) 33 34 // outOfRangeSyscallNumber is used to represent a syscall number that is out of the 35 // range [0, maxSyscallNum] in monitoring. 36 var outOfRangeSyscallNumber = []*metric.FieldValue{&metric.FieldValue{"-1"}} 37 38 // SyscallSupportLevel is a syscall support levels. 39 type SyscallSupportLevel int 40 41 // String returns a human readable representation of the support level. 42 func (l SyscallSupportLevel) String() string { 43 switch l { 44 case SupportUnimplemented: 45 return "Unimplemented" 46 case SupportPartial: 47 return "Partial Support" 48 case SupportFull: 49 return "Full Support" 50 default: 51 return "Undocumented" 52 } 53 } 54 55 const ( 56 // SupportUndocumented indicates the syscall is not documented yet. 57 SupportUndocumented = iota 58 59 // SupportUnimplemented indicates the syscall is unimplemented. 60 SupportUnimplemented 61 62 // SupportPartial indicates the syscall is partially supported. 63 SupportPartial 64 65 // SupportFull indicates the syscall is fully supported. 66 SupportFull 67 ) 68 69 // Syscall includes the syscall implementation and compatibility information. 70 type Syscall struct { 71 // Name is the syscall name. 72 Name string 73 // Fn is the implementation of the syscall. 74 Fn SyscallFn 75 // SupportLevel is the level of support implemented in gVisor. 76 SupportLevel SyscallSupportLevel 77 // Note describes the compatibility of the syscall. 78 Note string 79 // URLs is set of URLs to any relevant bugs or issues. 80 URLs []string 81 // PointCallback is an optional callback that converts syscall arguments 82 // to a proto that can be used with seccheck.Sink. 83 // Callback functions must follow this naming convention: 84 // PointSyscallNameInCamelCase, e.g. PointReadat, PointRtSigaction. 85 PointCallback SyscallToProto 86 } 87 88 // SyscallFn is a syscall implementation. 89 type SyscallFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error) 90 91 // MissingFn is a syscall to be called when an implementation is missing. 92 type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) 93 94 // Possible flags for SyscallFlagsTable.enable. 95 const ( 96 // syscallPresent indicates that this is not a missing syscall. 97 // 98 // This flag is used internally in SyscallFlagsTable. 99 syscallPresent = 1 << iota 100 101 // StraceEnableLog enables syscall log tracing. 102 StraceEnableLog 103 104 // StraceEnableEvent enables syscall event tracing. 105 StraceEnableEvent 106 107 // ExternalBeforeEnable enables the external hook before syscall execution. 108 ExternalBeforeEnable 109 110 // ExternalAfterEnable enables the external hook after syscall execution. 111 ExternalAfterEnable 112 113 // SecCheckEnter represents a schematized/enter syscall seccheck event. 114 SecCheckEnter 115 116 // SecCheckExit represents a schematized/exit syscall seccheck event. 117 SecCheckExit 118 119 // SecCheckRawEnter represents raw/enter syscall seccheck event. 120 SecCheckRawEnter 121 122 // SecCheckRawExit represents raw/exit syscall seccheck event. 123 SecCheckRawExit 124 ) 125 126 // StraceEnableBits combines both strace log and event flags. 127 const StraceEnableBits = StraceEnableLog | StraceEnableEvent 128 129 // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall 130 // basis. 131 type SyscallFlagsTable struct { 132 // mu protects writes to the fields below. 133 // 134 // Atomic loads are always allowed. Atomic stores are allowed only 135 // while mu is held. 136 mu sync.Mutex 137 138 // enable contains the enable bits for each syscall. 139 // 140 // missing syscalls have the same value in enable as missingEnable to 141 // avoid an extra branch in Word. 142 enable [sentry.MaxSyscallNum + 1]atomicbitops.Uint32 143 144 // missingEnable contains the enable bits for missing syscalls. 145 missingEnable atomicbitops.Uint32 146 } 147 148 // Init initializes the struct, with all syscalls in table set to enable. 149 // 150 // max is the largest syscall number in table. 151 func (e *SyscallFlagsTable) init(table map[uintptr]Syscall) { 152 for num := range table { 153 enableFlags := uint32(syscallPresent) 154 e.enable[num] = atomicbitops.FromUint32(enableFlags) 155 } 156 seccheck.Global.AddSyscallFlagListener(e) 157 e.UpdateSecCheck(&seccheck.Global) 158 } 159 160 // UpdateSecCheck implements seccheck.SyscallFlagListener. 161 // 162 // It is called when per-syscall seccheck event enablement changes. 163 func (e *SyscallFlagsTable) UpdateSecCheck(state *seccheck.State) { 164 e.mu.Lock() 165 defer e.mu.Unlock() 166 for sysno := uintptr(0); sysno <= sentry.MaxSyscallNum; sysno++ { 167 oldFlags := e.enable[sysno].Load() 168 if !bits.IsOn32(oldFlags, syscallPresent) { 169 continue 170 } 171 flags := oldFlags 172 if state.SyscallEnabled(seccheck.SyscallEnter, sysno) { 173 flags |= SecCheckEnter 174 } else { 175 flags &^= SecCheckEnter 176 } 177 if state.SyscallEnabled(seccheck.SyscallExit, sysno) { 178 flags |= SecCheckExit 179 } else { 180 flags &^= SecCheckExit 181 } 182 if state.SyscallEnabled(seccheck.SyscallRawEnter, sysno) { 183 flags |= SecCheckRawEnter 184 } else { 185 flags &^= SecCheckRawEnter 186 } 187 if state.SyscallEnabled(seccheck.SyscallRawExit, sysno) { 188 flags |= SecCheckRawExit 189 } else { 190 flags &^= SecCheckRawExit 191 } 192 if flags != oldFlags { 193 e.enable[sysno].Store(flags) 194 } 195 } 196 } 197 198 // Word returns the enable bitfield for sysno. 199 func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { 200 if sysno <= sentry.MaxSyscallNum { 201 return e.enable[sysno].Load() 202 } 203 return e.missingEnable.Load() 204 } 205 206 // Enable sets enable bit `bit` for all syscalls based on s. 207 // 208 // Syscalls missing from `s` are disabled. 209 // 210 // Syscalls missing from the initial table passed to Init cannot be added as 211 // individual syscalls. If present in s they will be ignored. 212 // 213 // Callers to Word may see either the old or new value while this function 214 // is executing. 215 func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { 216 e.mu.Lock() 217 defer e.mu.Unlock() 218 219 missingVal := e.missingEnable.Load() 220 if missingEnable { 221 missingVal |= bit 222 } else { 223 missingVal &^= bit 224 } 225 e.missingEnable.Store(missingVal) 226 227 for num := range e.enable { 228 val := e.enable[num].Load() 229 if !bits.IsOn32(val, syscallPresent) { 230 // Missing. 231 e.enable[num].Store(missingVal) 232 continue 233 } 234 235 if s[uintptr(num)] { 236 val |= bit 237 } else { 238 val &^= bit 239 } 240 e.enable[num].Store(val) 241 } 242 } 243 244 // EnableAll sets enable bit bit for all syscalls, present and missing. 245 func (e *SyscallFlagsTable) EnableAll(bit uint32) { 246 e.mu.Lock() 247 defer e.mu.Unlock() 248 249 missingVal := e.missingEnable.Load() 250 missingVal |= bit 251 e.missingEnable.Store(missingVal) 252 253 for num := range e.enable { 254 val := e.enable[num].Load() 255 if !bits.IsOn32(val, syscallPresent) { 256 // Missing. 257 e.enable[num].Store(missingVal) 258 continue 259 } 260 261 val |= bit 262 e.enable[num].Store(val) 263 } 264 } 265 266 // Stracer traces syscall execution. 267 type Stracer interface { 268 // SyscallEnter is called on syscall entry. 269 // 270 // The returned private data is passed to SyscallExit. 271 SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any 272 273 // SyscallExit is called on syscall exit. 274 SyscallExit(context any, t *Task, sysno, rval uintptr, err error) 275 } 276 277 // SyscallTable is a lookup table of system calls. 278 // 279 // Note that a SyscallTable is not savable directly. Instead, they are saved as 280 // an OS/Arch pair and lookup happens again on restore. 281 type SyscallTable struct { 282 // OS is the operating system that this syscall table implements. 283 OS abi.OS 284 285 // Arch is the architecture that this syscall table targets. 286 Arch arch.Arch 287 288 // The OS version that this syscall table implements. 289 Version Version 290 291 // AuditNumber is a numeric constant that represents the syscall table. If 292 // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by 293 // linux/audit.h. 294 AuditNumber uint32 295 296 // Table is the collection of functions. 297 Table map[uintptr]Syscall 298 299 // lookup is a fixed-size array that holds the syscalls (indexed by 300 // their numbers). It is used for fast look ups. 301 lookup [sentry.MaxSyscallNum + 1]SyscallFn 302 303 // pointCallbacks is a fixed-size array that holds SyscallToProto callbacks 304 // (indexed by syscall numbers). It is used for fast lookups when 305 // seccheck.Point is enabled for the syscall. 306 pointCallbacks [sentry.MaxSyscallNum + 1]SyscallToProto 307 308 // Emulate is a collection of instruction addresses to emulate. The 309 // keys are addresses, and the values are system call numbers. 310 Emulate map[hostarch.Addr]uintptr 311 312 // The function to call in case of a missing system call. 313 Missing MissingFn 314 315 // Stracer traces this syscall table. 316 Stracer Stracer 317 318 // External is used to handle an external callback. 319 External func(*Kernel) 320 321 // ExternalFilterBefore is called before External is called before the syscall is executed. 322 // External is not called if it returns false. 323 ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool 324 325 // ExternalFilterAfter is called before External is called after the syscall is executed. 326 // External is not called if it returns false. 327 ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool 328 329 // FeatureEnable stores the strace and one-shot enable bits. 330 FeatureEnable SyscallFlagsTable 331 } 332 333 // MaxSysno returns the largest system call number. 334 func (s *SyscallTable) MaxSysno() (max uintptr) { 335 for num := range s.Table { 336 if num > max { 337 max = num 338 } 339 } 340 return max 341 } 342 343 // allSyscallTables contains all known tables. 344 var allSyscallTables []*SyscallTable 345 346 var ( 347 // unimplementedSyscallCounterInit ensures the following fields are only initialized once. 348 unimplementedSyscallCounterInit sync.Once 349 350 // unimplementedSyscallNumbers maps syscall numbers to their string representation. 351 // Used such that incrementing unimplementedSyscallCounter does not require allocating memory. 352 // Each element in the mapped slices are of length 1, as there is only one field for the 353 // unimplemented syscall counter metric. Allocating a slice is necessary as it is passed as a 354 // variadic argument to the metric library. 355 unimplementedSyscallNumbers map[uintptr][]*metric.FieldValue 356 357 // unimplementedSyscallCounter tracks the number of times each unimplemented syscall has been 358 // called by the sandboxed application. 359 unimplementedSyscallCounter *metric.Uint64Metric 360 ) 361 362 // SyscallTables returns a read-only slice of registered SyscallTables. 363 func SyscallTables() []*SyscallTable { 364 return allSyscallTables 365 } 366 367 // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. 368 func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { 369 for _, s := range allSyscallTables { 370 if s.OS == os && s.Arch == a { 371 return s, true 372 } 373 } 374 return nil, false 375 } 376 377 // RegisterSyscallTable registers a new syscall table for use by a Kernel. 378 func RegisterSyscallTable(s *SyscallTable) { 379 if max := s.MaxSysno(); max > sentry.MaxSyscallNum { 380 panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) 381 } 382 if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { 383 panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) 384 } 385 allSyscallTables = append(allSyscallTables, s) 386 unimplementedSyscallCounterInit.Do(func() { 387 allowedValues := make([]*metric.FieldValue, sentry.MaxSyscallNum+2) 388 unimplementedSyscallNumbers = make(map[uintptr][]*metric.FieldValue, len(allowedValues)) 389 for i := uintptr(0); i <= sentry.MaxSyscallNum; i++ { 390 s := &metric.FieldValue{strconv.Itoa(int(i))} 391 allowedValues[i] = s 392 unimplementedSyscallNumbers[i] = []*metric.FieldValue{s} 393 } 394 allowedValues[len(allowedValues)-1] = outOfRangeSyscallNumber[0] 395 unimplementedSyscallCounter = metric.MustCreateNewUint64Metric("/unimplemented_syscalls", true, "Number of times the application tried to call an unimplemented syscall, broken down by syscall number", metric.NewField("sysno", allowedValues...)) 396 }) 397 s.Init() 398 } 399 400 // Init initializes the system call table. 401 // 402 // This should normally be called only during registration. 403 func (s *SyscallTable) Init() { 404 if s.Table == nil { 405 // Ensure non-nil lookup table. 406 s.Table = make(map[uintptr]Syscall) 407 } 408 if s.Emulate == nil { 409 // Ensure non-nil emulate table. 410 s.Emulate = make(map[hostarch.Addr]uintptr) 411 } 412 413 // Initialize the fast-lookup tables. 414 for num, sc := range s.Table { 415 s.lookup[num] = sc.Fn 416 } 417 for num, sc := range s.Table { 418 s.pointCallbacks[num] = sc.PointCallback 419 } 420 421 // Initialize all features. 422 s.FeatureEnable.init(s.Table) 423 } 424 425 // Lookup returns the syscall implementation, if one exists. 426 func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { 427 if sysno <= sentry.MaxSyscallNum { 428 return s.lookup[sysno] 429 } 430 return nil 431 } 432 433 // LookupName looks up a syscall name. 434 func (s *SyscallTable) LookupName(sysno uintptr) string { 435 if sc, ok := s.Table[sysno]; ok { 436 return sc.Name 437 } 438 return fmt.Sprintf("sys_%d", sysno) // Unlikely. 439 } 440 441 // LookupNo looks up a syscall number by name. 442 func (s *SyscallTable) LookupNo(name string) (uintptr, error) { 443 for i, syscall := range s.Table { 444 if syscall.Name == name { 445 return uintptr(i), nil 446 } 447 } 448 return 0, fmt.Errorf("syscall %q not found", name) 449 } 450 451 // LookupEmulate looks up an emulation syscall number. 452 func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { 453 sysno, ok := s.Emulate[addr] 454 return sysno, ok 455 } 456 457 // mapLookup is similar to Lookup, except that it only uses the syscall table, 458 // that is, it skips the fast look array. This is available for benchmarking. 459 func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { 460 if sc, ok := s.Table[sysno]; ok { 461 return sc.Fn 462 } 463 return nil 464 } 465 466 // LookupSyscallToProto looks up the SyscallToProto callback for the given 467 // syscall. It may return nil if none is registered. 468 func (s *SyscallTable) LookupSyscallToProto(sysno uintptr) SyscallToProto { 469 if sysno > sentry.MaxSyscallNum { 470 return nil 471 } 472 return s.pointCallbacks[sysno] 473 } 474 475 // SyscallToProto is a callback function that converts generic syscall data to 476 // schematized protobuf for the corresponding syscall. 477 type SyscallToProto func(*Task, seccheck.FieldSet, *pb.ContextData, SyscallInfo) (proto.Message, pb.MessageType) 478 479 // SyscallInfo provides generic information about the syscall. 480 type SyscallInfo struct { 481 Exit bool 482 Sysno uintptr 483 Args arch.SyscallArguments 484 Rval uintptr 485 Errno int 486 } 487 488 // IncrementUnimplementedSyscallCounter increments the "unimplemented syscall" metric for the given 489 // syscall number. 490 // A syscall table must have been initialized prior to calling this function. 491 // +checkescape:all 492 // 493 //go:nosplit 494 func IncrementUnimplementedSyscallCounter(sysno uintptr) { 495 s, found := unimplementedSyscallNumbers[sysno] 496 if !found { 497 s = outOfRangeSyscallNumber 498 } 499 unimplementedSyscallCounter.Increment(s...) 500 }