// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"strconv"

	"google.golang.org/protobuf/proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/bits"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/metric"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

const (
	// maxSyscallNum is the highest supported syscall number.
	//
	// The types below keep fixed-size, per-syscall lookup arrays with
	// maxSyscallNum+1 entries. This maximum serves as a sanity check that we
	// don't allocate huge arrays for a very large syscall number. It is
	// checked during registration.
	// LINT.IfChange
	maxSyscallNum = 2000
	// LINT.ThenChange(../seccheck/syscall.go)
)

// outOfRangeSyscallNumber is used to represent a syscall number that is out of the
// range [0, maxSyscallNum] in monitoring.
46 var outOfRangeSyscallNumber = []*metric.FieldValue{&metric.FieldValue{"-1"}} 47 48 // SyscallSupportLevel is a syscall support levels. 49 type SyscallSupportLevel int 50 51 // String returns a human readable represetation of the support level. 52 func (l SyscallSupportLevel) String() string { 53 switch l { 54 case SupportUnimplemented: 55 return "Unimplemented" 56 case SupportPartial: 57 return "Partial Support" 58 case SupportFull: 59 return "Full Support" 60 default: 61 return "Undocumented" 62 } 63 } 64 65 const ( 66 // SupportUndocumented indicates the syscall is not documented yet. 67 SupportUndocumented = iota 68 69 // SupportUnimplemented indicates the syscall is unimplemented. 70 SupportUnimplemented 71 72 // SupportPartial indicates the syscall is partially supported. 73 SupportPartial 74 75 // SupportFull indicates the syscall is fully supported. 76 SupportFull 77 ) 78 79 // Syscall includes the syscall implementation and compatibility information. 80 type Syscall struct { 81 // Name is the syscall name. 82 Name string 83 // Fn is the implementation of the syscall. 84 Fn SyscallFn 85 // SupportLevel is the level of support implemented in gVisor. 86 SupportLevel SyscallSupportLevel 87 // Note describes the compatibility of the syscall. 88 Note string 89 // URLs is set of URLs to any relevant bugs or issues. 90 URLs []string 91 // PointCallback is an optional callback that converts syscall arguments 92 // to a proto that can be used with seccheck.Sink. 93 // Callback functions must follow this naming convention: 94 // PointSyscallNameInCamelCase, e.g. PointReadat, PointRtSigaction. 95 PointCallback SyscallToProto 96 } 97 98 // SyscallFn is a syscall implementation. 99 type SyscallFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error) 100 101 // MissingFn is a syscall to be called when an implementation is missing. 
type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)

// Possible flags for SyscallFlagsTable.enable.
//
// Every constant in this block is a distinct bit (1 << iota), so several
// flags can be combined in one uint32 enable word.
const (
	// syscallPresent indicates that this is not a missing syscall.
	//
	// This flag is used internally in SyscallFlagsTable.
	syscallPresent = 1 << iota

	// StraceEnableLog enables syscall log tracing.
	StraceEnableLog

	// StraceEnableEvent enables syscall event tracing.
	StraceEnableEvent

	// ExternalBeforeEnable enables the external hook before syscall execution.
	ExternalBeforeEnable

	// ExternalAfterEnable enables the external hook after syscall execution.
	ExternalAfterEnable

	// SecCheckEnter represents a schematized/enter syscall seccheck event.
	SecCheckEnter

	// SecCheckExit represents a schematized/exit syscall seccheck event.
	SecCheckExit

	// SecCheckRawEnter represents a raw/enter syscall seccheck event.
	SecCheckRawEnter

	// SecCheckRawExit represents a raw/exit syscall seccheck event.
	SecCheckRawExit
)

// StraceEnableBits combines both strace log and event flags.
const StraceEnableBits = StraceEnableLog | StraceEnableEvent

// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
// basis.
type SyscallFlagsTable struct {
	// mu protects writes to the fields below.
	//
	// Atomic loads are always allowed. Atomic stores are allowed only
	// while mu is held.
	mu sync.Mutex

	// enable contains the enable bits for each syscall, indexed by syscall
	// number.
	//
	// Missing syscalls have the same value in enable as missingEnable to
	// avoid an extra branch in Word.
	enable [maxSyscallNum + 1]atomicbitops.Uint32

	// missingEnable contains the enable bits for missing syscalls.
	missingEnable atomicbitops.Uint32
}

// init initializes the struct, marking every syscall in table as present.
159 // 160 // max is the largest syscall number in table. 161 func (e *SyscallFlagsTable) init(table map[uintptr]Syscall) { 162 for num := range table { 163 enableFlags := uint32(syscallPresent) 164 e.enable[num] = atomicbitops.FromUint32(enableFlags) 165 } 166 seccheck.Global.AddSyscallFlagListener(e) 167 e.UpdateSecCheck(&seccheck.Global) 168 } 169 170 // UpdateSecCheck implements seccheck.SyscallFlagListener. 171 // 172 // It is called when per-syscall seccheck event enablement changes. 173 func (e *SyscallFlagsTable) UpdateSecCheck(state *seccheck.State) { 174 e.mu.Lock() 175 defer e.mu.Unlock() 176 for sysno := uintptr(0); sysno < maxSyscallNum; sysno++ { 177 oldFlags := e.enable[sysno].Load() 178 if !bits.IsOn32(oldFlags, syscallPresent) { 179 continue 180 } 181 flags := oldFlags 182 if state.SyscallEnabled(seccheck.SyscallEnter, sysno) { 183 flags |= SecCheckEnter 184 } else { 185 flags &^= SecCheckEnter 186 } 187 if state.SyscallEnabled(seccheck.SyscallExit, sysno) { 188 flags |= SecCheckExit 189 } else { 190 flags &^= SecCheckExit 191 } 192 if state.SyscallEnabled(seccheck.SyscallRawEnter, sysno) { 193 flags |= SecCheckRawEnter 194 } else { 195 flags &^= SecCheckRawEnter 196 } 197 if state.SyscallEnabled(seccheck.SyscallRawExit, sysno) { 198 flags |= SecCheckRawExit 199 } else { 200 flags &^= SecCheckRawExit 201 } 202 if flags != oldFlags { 203 e.enable[sysno].Store(flags) 204 } 205 } 206 } 207 208 // Word returns the enable bitfield for sysno. 209 func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { 210 if sysno <= maxSyscallNum { 211 return e.enable[sysno].Load() 212 } 213 return e.missingEnable.Load() 214 } 215 216 // Enable sets enable bit `bit` for all syscalls based on s. 217 // 218 // Syscalls missing from `s` are disabled. 219 // 220 // Syscalls missing from the initial table passed to Init cannot be added as 221 // individual syscalls. If present in s they will be ignored. 
222 // 223 // Callers to Word may see either the old or new value while this function 224 // is executing. 225 func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { 226 e.mu.Lock() 227 defer e.mu.Unlock() 228 229 missingVal := e.missingEnable.Load() 230 if missingEnable { 231 missingVal |= bit 232 } else { 233 missingVal &^= bit 234 } 235 e.missingEnable.Store(missingVal) 236 237 for num := range e.enable { 238 val := e.enable[num].Load() 239 if !bits.IsOn32(val, syscallPresent) { 240 // Missing. 241 e.enable[num].Store(missingVal) 242 continue 243 } 244 245 if s[uintptr(num)] { 246 val |= bit 247 } else { 248 val &^= bit 249 } 250 e.enable[num].Store(val) 251 } 252 } 253 254 // EnableAll sets enable bit bit for all syscalls, present and missing. 255 func (e *SyscallFlagsTable) EnableAll(bit uint32) { 256 e.mu.Lock() 257 defer e.mu.Unlock() 258 259 missingVal := e.missingEnable.Load() 260 missingVal |= bit 261 e.missingEnable.Store(missingVal) 262 263 for num := range e.enable { 264 val := e.enable[num].Load() 265 if !bits.IsOn32(val, syscallPresent) { 266 // Missing. 267 e.enable[num].Store(missingVal) 268 continue 269 } 270 271 val |= bit 272 e.enable[num].Store(val) 273 } 274 } 275 276 // Stracer traces syscall execution. 277 type Stracer interface { 278 // SyscallEnter is called on syscall entry. 279 // 280 // The returned private data is passed to SyscallExit. 281 SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any 282 283 // SyscallExit is called on syscall exit. 284 SyscallExit(context any, t *Task, sysno, rval uintptr, err error) 285 } 286 287 // SyscallTable is a lookup table of system calls. 288 // 289 // Note that a SyscallTable is not savable directly. Instead, they are saved as 290 // an OS/Arch pair and lookup happens again on restore. 291 type SyscallTable struct { 292 // OS is the operating system that this syscall table implements. 
293 OS abi.OS 294 295 // Arch is the architecture that this syscall table targets. 296 Arch arch.Arch 297 298 // The OS version that this syscall table implements. 299 Version Version 300 301 // AuditNumber is a numeric constant that represents the syscall table. If 302 // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by 303 // linux/audit.h. 304 AuditNumber uint32 305 306 // Table is the collection of functions. 307 Table map[uintptr]Syscall 308 309 // lookup is a fixed-size array that holds the syscalls (indexed by 310 // their numbers). It is used for fast look ups. 311 lookup [maxSyscallNum + 1]SyscallFn 312 313 // pointCallbacks is a fixed-size array that holds SyscallToProto callbacks 314 // (indexed by syscall numbers). It is used for fast lookups when 315 // seccheck.Point is enabled for the syscall. 316 pointCallbacks [maxSyscallNum + 1]SyscallToProto 317 318 // Emulate is a collection of instruction addresses to emulate. The 319 // keys are addresses, and the values are system call numbers. 320 Emulate map[hostarch.Addr]uintptr 321 322 // The function to call in case of a missing system call. 323 Missing MissingFn 324 325 // Stracer traces this syscall table. 326 Stracer Stracer 327 328 // External is used to handle an external callback. 329 External func(*Kernel) 330 331 // ExternalFilterBefore is called before External is called before the syscall is executed. 332 // External is not called if it returns false. 333 ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool 334 335 // ExternalFilterAfter is called before External is called after the syscall is executed. 336 // External is not called if it returns false. 337 ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool 338 339 // FeatureEnable stores the strace and one-shot enable bits. 340 FeatureEnable SyscallFlagsTable 341 } 342 343 // MaxSysno returns the largest system call number. 
344 func (s *SyscallTable) MaxSysno() (max uintptr) { 345 for num := range s.Table { 346 if num > max { 347 max = num 348 } 349 } 350 return max 351 } 352 353 // allSyscallTables contains all known tables. 354 var allSyscallTables []*SyscallTable 355 356 var ( 357 // unimplementedSyscallCounterInit ensures the following fields are only initialized once. 358 unimplementedSyscallCounterInit sync.Once 359 360 // unimplementedSyscallNumbers maps syscall numbers to their string representation. 361 // Used such that incrementing unimplementedSyscallCounter does not require allocating memory. 362 // Each element in the mapped slices are of length 1, as there is only one field for the 363 // unimplemented syscall counter metric. Allocating a slice is necessary as it is passed as a 364 // variadic argument to the metric library. 365 unimplementedSyscallNumbers map[uintptr][]*metric.FieldValue 366 367 // unimplementedSyscallCounter tracks the number of times each unimplemented syscall has been 368 // called by the sandboxed application. 369 unimplementedSyscallCounter *metric.Uint64Metric 370 ) 371 372 // SyscallTables returns a read-only slice of registered SyscallTables. 373 func SyscallTables() []*SyscallTable { 374 return allSyscallTables 375 } 376 377 // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. 378 func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { 379 for _, s := range allSyscallTables { 380 if s.OS == os && s.Arch == a { 381 return s, true 382 } 383 } 384 return nil, false 385 } 386 387 // RegisterSyscallTable registers a new syscall table for use by a Kernel. 
388 func RegisterSyscallTable(s *SyscallTable) { 389 if max := s.MaxSysno(); max > maxSyscallNum { 390 panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) 391 } 392 if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { 393 panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) 394 } 395 allSyscallTables = append(allSyscallTables, s) 396 unimplementedSyscallCounterInit.Do(func() { 397 allowedValues := make([]*metric.FieldValue, maxSyscallNum+2) 398 unimplementedSyscallNumbers = make(map[uintptr][]*metric.FieldValue, len(allowedValues)) 399 for i := uintptr(0); i <= maxSyscallNum; i++ { 400 s := &metric.FieldValue{strconv.Itoa(int(i))} 401 allowedValues[i] = s 402 unimplementedSyscallNumbers[i] = []*metric.FieldValue{s} 403 } 404 allowedValues[len(allowedValues)-1] = outOfRangeSyscallNumber[0] 405 unimplementedSyscallCounter = metric.MustCreateNewUint64Metric("/unimplemented_syscalls", true, "Number of times the application tried to call an unimplemented syscall, broken down by syscall number", metric.NewField("sysno", allowedValues...)) 406 }) 407 s.Init() 408 } 409 410 // Init initializes the system call table. 411 // 412 // This should normally be called only during registration. 413 func (s *SyscallTable) Init() { 414 if s.Table == nil { 415 // Ensure non-nil lookup table. 416 s.Table = make(map[uintptr]Syscall) 417 } 418 if s.Emulate == nil { 419 // Ensure non-nil emulate table. 420 s.Emulate = make(map[hostarch.Addr]uintptr) 421 } 422 423 // Initialize the fast-lookup tables. 424 for num, sc := range s.Table { 425 s.lookup[num] = sc.Fn 426 } 427 for num, sc := range s.Table { 428 s.pointCallbacks[num] = sc.PointCallback 429 } 430 431 // Initialize all features. 432 s.FeatureEnable.init(s.Table) 433 } 434 435 // Lookup returns the syscall implementation, if one exists. 
436 func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { 437 if sysno <= maxSyscallNum { 438 return s.lookup[sysno] 439 } 440 return nil 441 } 442 443 // LookupName looks up a syscall name. 444 func (s *SyscallTable) LookupName(sysno uintptr) string { 445 if sc, ok := s.Table[sysno]; ok { 446 return sc.Name 447 } 448 return fmt.Sprintf("sys_%d", sysno) // Unlikely. 449 } 450 451 // LookupNo looks up a syscall number by name. 452 func (s *SyscallTable) LookupNo(name string) (uintptr, error) { 453 for i, syscall := range s.Table { 454 if syscall.Name == name { 455 return uintptr(i), nil 456 } 457 } 458 return 0, fmt.Errorf("syscall %q not found", name) 459 } 460 461 // LookupEmulate looks up an emulation syscall number. 462 func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { 463 sysno, ok := s.Emulate[addr] 464 return sysno, ok 465 } 466 467 // mapLookup is similar to Lookup, except that it only uses the syscall table, 468 // that is, it skips the fast look array. This is available for benchmarking. 469 func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { 470 if sc, ok := s.Table[sysno]; ok { 471 return sc.Fn 472 } 473 return nil 474 } 475 476 // LookupSyscallToProto looks up the SyscallToProto callback for the given 477 // syscall. It may return nil if none is registered. 478 func (s *SyscallTable) LookupSyscallToProto(sysno uintptr) SyscallToProto { 479 if sysno > maxSyscallNum { 480 return nil 481 } 482 return s.pointCallbacks[sysno] 483 } 484 485 // SyscallToProto is a callback function that converts generic syscall data to 486 // schematized protobuf for the corresponding syscall. 487 type SyscallToProto func(*Task, seccheck.FieldSet, *pb.ContextData, SyscallInfo) (proto.Message, pb.MessageType) 488 489 // SyscallInfo provides generic information about the syscall. 
type SyscallInfo struct {
	// Exit presumably marks a syscall-exit (as opposed to syscall-enter)
	// record. TODO confirm against the callers that populate it.
	Exit bool
	// Sysno is the syscall number.
	Sysno uintptr
	// Args holds the syscall arguments.
	Args arch.SyscallArguments
	// Rval and Errno presumably carry the syscall result and are only
	// meaningful on exit. TODO confirm.
	Rval  uintptr
	Errno int
}

// IncrementUnimplementedSyscallCounter increments the "unimplemented syscall" metric for the given
// syscall number.
// Numbers with no entry in unimplementedSyscallNumbers are counted under the
// out-of-range field value instead.
// A syscall table must have been initialized prior to calling this function.
// +checkescape:all
//
//go:nosplit
func IncrementUnimplementedSyscallCounter(sysno uintptr) {
	s, found := unimplementedSyscallNumbers[sysno]
	if !found {
		// Fall back to the shared out-of-range field value.
		s = outOfRangeSyscallNumber
	}
	unimplementedSyscallCounter.Increment(s...)
}